Activation offloading for fullfinetuning + fix tied embedding (#1847)
Co-authored-by: Felipe Mello <[email protected]>
felipemello1 and Felipe Mello authored Oct 30, 2024
1 parent a1bcb97 commit e99b890
Showing 89 changed files with 384 additions and 103 deletions.
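
Every hunk below makes the same change: a new enable_activation_offloading flag is added (or annotated) next to the existing enable_activation_checkpointing flag in each recipe config. Below is a minimal sketch of the resulting memory-management block, assuming a single-device full-finetune recipe; the keys and values are taken from the diffs in this commit, while the trailing comments are our own gloss, not text from the configs:

    # Memory management (illustrative sketch, not a specific config from this commit)
    enable_activation_checkpointing: True  # recompute activations in the backward pass instead of storing them
    enable_activation_offloading: True     # stream saved activations to CPU between forward and backward; True reduces memory
    dtype: bf16

Most configs keep the new flag at False; the low-memory recipes (e.g. 7B_full_low_memory.yaml) enable it. In torchtune's recipes, offloading is typically paired with enable_activation_checkpointing: True, since it offloads the tensors that checkpointing saves for the backward pass.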
1 change: 1 addition & 0 deletions recipes/configs/code_llama2/7B_full_low_memory.yaml
@@ -69,6 +69,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: True # True reduces memory
dtype: bf16

# Logging
2 changes: 1 addition & 1 deletion recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -77,7 +77,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory
dtype: bf16

# Logging
2 changes: 1 addition & 1 deletion recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -76,7 +76,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory
dtype: bf16

# Logging
1 change: 1 addition & 0 deletions recipes/configs/dev/8B_full_experimental.yaml
@@ -65,6 +65,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: False
+enable_activation_offloading: False # True reduces memory
ac_mode: 'selective' # ['selective', 'full']
ac_option: 2 # [int] = ac every positive int layer
memory_efficient_fsdp_wrap: False
1 change: 1 addition & 0 deletions recipes/configs/gemma/2B_full.yaml
@@ -62,6 +62,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
1 change: 1 addition & 0 deletions recipes/configs/gemma/2B_lora.yaml
@@ -74,6 +74,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
2 changes: 1 addition & 1 deletion recipes/configs/gemma/2B_lora_single_device.yaml
@@ -73,7 +73,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
2 changes: 1 addition & 1 deletion recipes/configs/gemma/2B_qlora_single_device.yaml
@@ -73,7 +73,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
1 change: 1 addition & 0 deletions recipes/configs/gemma/7B_full.yaml
@@ -64,6 +64,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
1 change: 1 addition & 0 deletions recipes/configs/gemma/7B_lora.yaml
@@ -76,6 +76,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
2 changes: 1 addition & 1 deletion recipes/configs/gemma/7B_lora_single_device.yaml
@@ -75,7 +75,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
2 changes: 1 addition & 1 deletion recipes/configs/gemma/7B_qlora_single_device.yaml
@@ -75,7 +75,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
1 change: 1 addition & 0 deletions recipes/configs/llama2/13B_full.yaml
@@ -66,6 +66,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
1 change: 1 addition & 0 deletions recipes/configs/llama2/13B_lora.yaml
@@ -89,3 +89,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: False
+enable_activation_offloading: False # True reduces memory
2 changes: 1 addition & 1 deletion recipes/configs/llama2/13B_qlora_single_device.yaml
@@ -85,7 +85,7 @@ device: cuda
dtype: bf16

enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
1 change: 1 addition & 0 deletions recipes/configs/llama2/70B_lora.yaml
@@ -88,3 +88,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory
1 change: 1 addition & 0 deletions recipes/configs/llama2/70B_qlora.yaml
@@ -98,3 +98,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory
1 change: 1 addition & 0 deletions recipes/configs/llama2/7B_full.yaml
@@ -65,6 +65,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
1 change: 1 addition & 0 deletions recipes/configs/llama2/7B_full_low_memory.yaml
@@ -70,6 +70,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: True # True reduces memory

# Reduced precision
dtype: bf16
1 change: 1 addition & 0 deletions recipes/configs/llama2/7B_lora.yaml
@@ -85,6 +85,7 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: False
+enable_activation_offloading: False # True reduces memory

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
2 changes: 1 addition & 1 deletion recipes/configs/llama2/7B_lora_single_device.yaml
@@ -86,7 +86,7 @@ dtype: bf16

# Activations Memory
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
1 change: 1 addition & 0 deletions recipes/configs/llama2/7B_qlora.yaml
@@ -89,3 +89,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory
2 changes: 1 addition & 1 deletion recipes/configs/llama2/7B_qlora_single_device.yaml
@@ -85,7 +85,7 @@ dtype: bf16

# Activations Memory
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
1 change: 1 addition & 0 deletions recipes/configs/llama3/70B_full.yaml
@@ -93,6 +93,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory
custom_sharded_layers: ['tok_embeddings', 'output']
fsdp_cpu_offload: True
compile: False # pytorch compile, set to true for perf/memory improvement
1 change: 1 addition & 0 deletions recipes/configs/llama3/70B_lora.yaml
@@ -104,3 +104,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory
1 change: 1 addition & 0 deletions recipes/configs/llama3/8B_dora.yaml
@@ -79,3 +79,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: False
+enable_activation_offloading: False # True reduces memory
1 change: 1 addition & 0 deletions recipes/configs/llama3/8B_dora_single_device.yaml
@@ -81,6 +81,7 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
1 change: 1 addition & 0 deletions recipes/configs/llama3/8B_full.yaml
@@ -65,6 +65,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory
custom_sharded_layers: ['tok_embeddings', 'output']

# Reduced precision
1 change: 1 addition & 0 deletions recipes/configs/llama3/8B_full_single_device.yaml
@@ -69,6 +69,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
1 change: 1 addition & 0 deletions recipes/configs/llama3/8B_lora.yaml
@@ -84,3 +84,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: False
+enable_activation_offloading: False # True reduces memory
2 changes: 1 addition & 1 deletion recipes/configs/llama3/8B_lora_single_device.yaml
@@ -85,7 +85,7 @@ dtype: bf16

# Activations Memory
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Profiler (disabled)
profiler:
1 change: 1 addition & 0 deletions recipes/configs/llama3/8B_qdora_single_device.yaml
@@ -82,6 +82,7 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
2 changes: 1 addition & 1 deletion recipes/configs/llama3/8B_qlora_single_device.yaml
@@ -84,7 +84,7 @@ dtype: bf16

# Activations Memory
enable_activation_checkpointing: True
-enable_activation_offloading: True
+enable_activation_offloading: False # True reduces memory

# Profiler (disabled)
profiler:
1 change: 1 addition & 0 deletions recipes/configs/llama3_1/405B_qlora.yaml
@@ -82,3 +82,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory
1 change: 1 addition & 0 deletions recipes/configs/llama3_1/70B_full.yaml
@@ -95,6 +95,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory
custom_sharded_layers: ['tok_embeddings', 'output']
fsdp_cpu_offload: True
compile: False # pytorch compile, set to true for perf/memory improvement
1 change: 1 addition & 0 deletions recipes/configs/llama3_1/70B_lora.yaml
@@ -103,3 +103,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory
1 change: 1 addition & 0 deletions recipes/configs/llama3_1/8B_full.yaml
@@ -68,6 +68,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory
custom_sharded_layers: ['tok_embeddings', 'output']
compile: False # pytorch compile, set to true for perf/memory improvement
1 change: 1 addition & 0 deletions recipes/configs/llama3_1/8B_full_single_device.yaml
@@ -69,6 +69,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
1 change: 1 addition & 0 deletions recipes/configs/llama3_1/8B_lora.yaml
@@ -87,3 +87,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: False
+enable_activation_offloading: False # True reduces memory
2 changes: 1 addition & 1 deletion recipes/configs/llama3_1/8B_lora_single_device.yaml
@@ -88,7 +88,7 @@ dtype: bf16

# Activations Memory
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Profiler (disabled)
profiler:
2 changes: 1 addition & 1 deletion recipes/configs/llama3_1/8B_qlora_single_device.yaml
@@ -87,7 +87,7 @@ dtype: bf16

# Activations Offloading
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Profiler (disabled)
profiler:
1 change: 1 addition & 0 deletions recipes/configs/llama3_2/1B_full.yaml
@@ -65,6 +65,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: False
+enable_activation_offloading: False # True reduces memory
compile: False # pytorch compile, set to true for perf/memory improvement

# Reduced precision
1 change: 1 addition & 0 deletions recipes/configs/llama3_2/1B_full_single_device.yaml
@@ -66,6 +66,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: False
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
1 change: 1 addition & 0 deletions recipes/configs/llama3_2/1B_lora.yaml
@@ -84,3 +84,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: False
+enable_activation_offloading: False # True reduces memory
2 changes: 1 addition & 1 deletion recipes/configs/llama3_2/1B_lora_single_device.yaml
@@ -85,7 +85,7 @@ dtype: bf16

# Activations Memory
enable_activation_checkpointing: False
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Profiler (disabled)
profiler:
2 changes: 1 addition & 1 deletion recipes/configs/llama3_2/1B_qlora_single_device.yaml
@@ -84,7 +84,7 @@ dtype: bf16

# Activations Memory
enable_activation_checkpointing: False
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Profiler (disabled)
profiler:
1 change: 1 addition & 0 deletions recipes/configs/llama3_2/3B_full.yaml
@@ -65,6 +65,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory
compile: False # pytorch compile, set to true for perf/memory improvement

# Reduced precision
1 change: 1 addition & 0 deletions recipes/configs/llama3_2/3B_full_single_device.yaml
@@ -67,6 +67,7 @@ device: cuda

# Memory management
enable_activation_checkpointing: True
+enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
1 change: 1 addition & 0 deletions recipes/configs/llama3_2/3B_lora.yaml
@@ -85,3 +85,4 @@ log_peak_memory_stats: True
device: cuda
dtype: bf16
enable_activation_checkpointing: False
+enable_activation_offloading: False # True reduces memory
2 changes: 1 addition & 1 deletion recipes/configs/llama3_2/3B_lora_single_device.yaml
@@ -86,7 +86,7 @@ dtype: bf16

# Activations Memory
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Profiler (disabled)
profiler:
2 changes: 1 addition & 1 deletion recipes/configs/llama3_2/3B_qlora_single_device.yaml
@@ -85,7 +85,7 @@ dtype: bf16

# Activations Memory
enable_activation_checkpointing: True
-enable_activation_offloading: False
+enable_activation_offloading: False # True reduces memory

# Profiler (disabled)
profiler:
@@ -106,7 +106,6 @@ dtype: bf16

# Activations Memory
enable_activation_checkpointing: False
-enable_activation_offloading: False

# Profiler (disabled)
profiler: