From 994c4d9086d7a8481b98e97eb8c0fb8204f11453 Mon Sep 17 00:00:00 2001
From: Felipe Mello
Date: Fri, 25 Oct 2024 20:21:34 -0700
Subject: [PATCH] updated configs + qlora

---
 recipes/configs/llama3_2_vision/11B_lora.yaml |  2 +-
 .../11B_lora_single_device.yaml               |  2 +-
 .../configs/llama3_2_vision/11B_qlora.yaml    | 13 ++-
 .../11B_qlora_single_device.yaml              | 15 +--
 recipes/configs/llama3_2_vision/90B_full.yaml |  2 -
 recipes/configs/llama3_2_vision/90B_lora.yaml |  4 +-
 .../configs/llama3_2_vision/90B_qlora.yaml    | 91 +++++++++++++++++++
 torchtune/_recipe_registry.py                 |  8 +-
 8 files changed, 117 insertions(+), 20 deletions(-)
 create mode 100644 recipes/configs/llama3_2_vision/90B_qlora.yaml

diff --git a/recipes/configs/llama3_2_vision/11B_lora.yaml b/recipes/configs/llama3_2_vision/11B_lora.yaml
index 6f18eaa08..3f542fa25 100644
--- a/recipes/configs/llama3_2_vision/11B_lora.yaml
+++ b/recipes/configs/llama3_2_vision/11B_lora.yaml
@@ -47,7 +47,7 @@ checkpointer:
   output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
   model_type: LLAMA3_VISION
 resume_from_checkpoint: False
-save_adapter_weights_only: True # Set to false to save the whole model + adapter merged
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
 
 # Dataset
 dataset:
diff --git a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
index 8b815144c..6bec024e3 100644
--- a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
+++ b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
@@ -45,7 +45,7 @@ checkpointer:
   output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
   model_type: LLAMA3_VISION
 resume_from_checkpoint: False
-save_adapter_weights_only: True # Set to false to save the whole model + adapter merged
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
 
 # Dataset
 dataset:
diff --git a/recipes/configs/llama3_2_vision/11B_qlora.yaml b/recipes/configs/llama3_2_vision/11B_qlora.yaml
index 1217fb367..dd3cf601c 100644
--- a/recipes/configs/llama3_2_vision/11B_qlora.yaml
+++ b/recipes/configs/llama3_2_vision/11B_qlora.yaml
@@ -2,7 +2,7 @@
 # using a Llama3.2 11B Vision Instruct model
 #
 # This config assumes that you've run the following command before launching:
-# tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct
+# tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct --ignore-patterns "original/consolidated*"
 #
 # To launch on 2 devices, run the following command from root:
 # tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_2_vision/11B_qlora
@@ -37,13 +37,16 @@ tokenizer:
 
 # Checkpointer
 checkpointer:
-  _component_: torchtune.training.FullModelMetaCheckpointer
-  checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/original/
-  checkpoint_files: [consolidated.pth]
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00005"
   recipe_checkpoint: null
   output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
   model_type: LLAMA3_VISION
 resume_from_checkpoint: False
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
 
 # Dataset
 dataset:
@@ -62,7 +65,7 @@ optimizer:
   _component_: torch.optim.AdamW
   fused: True
   weight_decay: 0.01
-  lr: 2e-5
+  lr: 1e-4
 lr_scheduler:
   _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
   num_warmup_steps: 100
diff --git a/recipes/configs/llama3_2_vision/11B_qlora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_qlora_single_device.yaml
index b12d51237..6131abe8b 100644
--- a/recipes/configs/llama3_2_vision/11B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_2_vision/11B_qlora_single_device.yaml
@@ -2,7 +2,7 @@
 # using a Llama3.2 11B Vision Instruct model
 #
 # This config assumes that you've run the following command before launching:
-# tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct
+# tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct --ignore-patterns "original/consolidated*"
 #
 # To launch on a single device, run the following command from root:
 # tune run lora_finetune_single_device --config llama3_2_vision/11B_qlora_single_device
@@ -36,13 +36,16 @@ tokenizer:
 
 # Checkpointer
 checkpointer:
-  _component_: torchtune.training.FullModelMetaCheckpointer
-  checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/original/
-  checkpoint_files: [consolidated.pth]
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00005"
   recipe_checkpoint: null
   output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
   model_type: LLAMA3_VISION
 resume_from_checkpoint: False
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
 
 # Dataset
 dataset:
@@ -61,8 +64,8 @@ optimizer:
   _component_: torch.optim.AdamW
   fused: True
   weight_decay: 0.01
-  lr: 2e-5
-optimizer_in_bwd: False
+  lr: 1e-4
+
 lr_scheduler:
   _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
   num_warmup_steps: 100
diff --git a/recipes/configs/llama3_2_vision/90B_full.yaml b/recipes/configs/llama3_2_vision/90B_full.yaml
index 5575785dd..9eee41e49 100644
--- a/recipes/configs/llama3_2_vision/90B_full.yaml
+++ b/recipes/configs/llama3_2_vision/90B_full.yaml
@@ -12,8 +12,6 @@
 # tune run --nproc_per_node 4 full_finetune_distributed --config llama3_2_vision/90B_full checkpointer.checkpoint_dir=
 #
 # This config works best when the model is being fine-tuned on 2+ GPUs.
-# Single device full finetuning requires more memory optimizations. It's
-# best to use 90B_full_single_device.yaml for those cases.
 
 # Model arguments
 model:
diff --git a/recipes/configs/llama3_2_vision/90B_lora.yaml b/recipes/configs/llama3_2_vision/90B_lora.yaml
index 95e166e75..8083357a2 100644
--- a/recipes/configs/llama3_2_vision/90B_lora.yaml
+++ b/recipes/configs/llama3_2_vision/90B_lora.yaml
@@ -12,8 +12,6 @@
 # tune run --nproc_per_node 8 lora_finetune_distributed --config llama3_2_vision/90B_lora checkpointer.checkpoint_dir=
 #
 # This config works best when the model is being fine-tuned on 2+ GPUs.
-# For single device LoRA finetuning please use 90B_lora_single_device.yaml
-# or 90B_qlora_single_device.yaml
 
 # Model arguments
 model:
@@ -47,7 +45,7 @@ checkpointer:
   output_dir: /tmp/Llama-3.2-90B-Vision-Instruct/
   model_type: LLAMA3_VISION
 resume_from_checkpoint: False
-save_adapter_weights_only: True # Set to false to save the whole model + adapter merged
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
 
 # Dataset
 dataset:
diff --git a/recipes/configs/llama3_2_vision/90B_qlora.yaml b/recipes/configs/llama3_2_vision/90B_qlora.yaml
new file mode 100644
index 000000000..868b47da8
--- /dev/null
+++ b/recipes/configs/llama3_2_vision/90B_qlora.yaml
@@ -0,0 +1,91 @@
+# Config for multi-device QLoRA finetuning in lora_finetune_distributed.py
+# using a Llama3.2 90B Vision Instruct model
+#
+# This config assumes that you've run the following command before launching:
+# tune download meta-llama/Llama-3.2-90B-Vision-Instruct --output-dir /tmp/Llama-3.2-90B-Vision-Instruct --ignore-patterns "original/consolidated*"
+#
+# To launch on 2 devices, run the following command from root:
+# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_2_vision/90B_qlora
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training:
+# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_2_vision/90B_qlora checkpointer.checkpoint_dir=
+#
+# This config works best when the model is being fine-tuned on 2+ GPUs.
+# For single device QLoRA finetuning please use 90B_qlora_single_device.yaml
+
+# Model arguments
+model:
+  _component_: torchtune.models.llama3_2_vision.qlora_llama3_2_vision_90b
+  decoder_trainable: "frozen"
+  encoder_trainable: "lora"
+  fusion_trainable: "lora"
+  lora_attn_modules: ['q_proj', 'v_proj']
+  apply_lora_to_mlp: False
+  apply_lora_to_output: False
+  lora_rank: 8
+  lora_alpha: 16
+  lora_dropout: 0.0
+  image_size: 560 # Make sure this matches the image_size in tokenizer
+
+# Transform
+tokenizer:
+  _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
+  path: /tmp/Llama-3.2-90B-Vision-Instruct/original/tokenizer.model
+  image_size: 560
+  max_seq_len: 8192
+
+# Checkpointer
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-3.2-90B-Vision-Instruct/
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00037"
+  recipe_checkpoint: null
+  output_dir: /tmp/Llama-3.2-90B-Vision-Instruct/
+  model_type: LLAMA3_VISION
+resume_from_checkpoint: False
+save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.
+
+# Dataset
+dataset:
+  _component_: torchtune.datasets.multimodal.the_cauldron_dataset
+  subset: ocrvqa
+seed: null
+shuffle: True
+collate_fn: torchtune.data.padded_collate_tiled_images_and_mask
+
+# Fine-tuning arguments
+epochs: 1
+max_steps_per_epoch: null
+batch_size: 2
+gradient_accumulation_steps: 4
+optimizer:
+  _component_: torch.optim.AdamW
+  fused: True
+  weight_decay: 0.01
+  lr: 1e-4
+lr_scheduler:
+  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: 1.0
+compile: False # set it to True for better memory and performance
+
+# Training env
+device: cuda
+
+# Memory management
+enable_activation_checkpointing: True
+enable_activation_offloading: False
+dtype: bf16
+
+# Logging
+output_dir: /tmp/qlora-llama3.2-vision-finetune
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: /tmp/Llama-3.2-90B-Vision-Instruct/logs
+log_every_n_steps: 1
+log_peak_memory_stats: False
diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py
index 5058d9b9e..7602d8e8c 100644
--- a/torchtune/_recipe_registry.py
+++ b/torchtune/_recipe_registry.py
@@ -297,13 +297,17 @@ class Recipe:
             name="llama3_2_vision/11B_lora",
             file_path="llama3_2_vision/11B_lora.yaml",
         ),
+        Config(
+            name="llama3_2_vision/11B_qlora",
+            file_path="llama3_2_vision/11B_qlora.yaml",
+        ),
         Config(
             name="llama3_2_vision/90B_lora",
             file_path="llama3_2_vision/90B_lora.yaml",
         ),
         Config(
-            name="llama3_2_vision/11B_qlora",
-            file_path="llama3_2_vision/11B_qlora.yaml",
+            name="llama3_2_vision/90B_qlora",
+            file_path="llama3_2_vision/90B_qlora.yaml",
         ),
     ],
     supports_distributed=True,