Commit 8696a72

Zero copy initialization of models onto training workers for LLMs (#3469)

Co-authored-by: Geoffrey Angus <[email protected]>
Co-authored-by: Travis Addair <[email protected]>
3 people authored Jul 22, 2023
1 parent 354627a commit 8696a72
Showing 11 changed files with 221 additions and 24 deletions.
4 changes: 2 additions & 2 deletions ludwig/api.py
@@ -638,12 +638,13 @@ def on_epoch_end(self, trainer, progress_tracker, save_path):
test_set=test_set,
save_path=model_dir,
)
(self.model, train_trainset_stats, train_valiset_stats, train_testset_stats) = train_stats

# Calibrates output feature probabilities on validation set if calibration is enabled.
# Must be done after training, and before final model parameters are saved.
if self.backend.is_coordinator():
calibrator = Calibrator(
trainer.model,
self.model,
self.backend,
batch_size=trainer.eval_batch_size,
)
@@ -684,7 +685,6 @@ def on_epoch_end(self, trainer, progress_tracker, save_path):
# List[TrainerMetric], with one entry per training checkpoint, according to steps_per_checkpoint.
# We reduce the dictionary of TrainerMetrics to a simple list of floats for interfacing with Ray
# Tune.
(self.model, train_trainset_stats, train_valiset_stats, train_testset_stats) = train_stats
train_stats = TrainingStats(
metric_utils.reduce_trainer_metrics_dict(train_trainset_stats),
metric_utils.reduce_trainer_metrics_dict(train_valiset_stats),
24 changes: 21 additions & 3 deletions ludwig/backend/ray.py
@@ -194,7 +194,10 @@ def train_fn(
if test_shard is not None:
test_shard = RayDatasetShard(test_shard, features, training_set_metadata)

model = ray.get(model_ref)
# Deserialize the model (minus weights) from Plasma
# Extract the weights from Plasma (without copying data)
# Load the weights back into the model in-place on the current device (CPU)
model = distributed.replace_model_from_serialization(ray.get(model_ref))
model = distributed.to_device(model)

trainer = remote_trainer_cls(
@@ -339,6 +342,7 @@ def __init__(self, trainer_kwargs: Dict[str, Any]) -> None:
trainer_kwargs = copy.copy(trainer_kwargs)
self.backend_config = trainer_kwargs.pop("backend", None)
self.strategy = trainer_kwargs.pop("strategy", get_default_strategy_name())
self.dist_strategy = get_dist_strategy(self.strategy)

if "max_retries" in trainer_kwargs:
logger.warning("`max_retries` is no longer supported as a trainer argument in Ray backend. Ignoring it.")
@@ -408,7 +412,7 @@ def run(

callbacks = callbacks or []

trainer_cls, kwargs = get_dist_strategy(self.strategy).get_trainer_cls(self.backend_config)
trainer_cls, kwargs = self.dist_strategy.get_trainer_cls(self.backend_config)
train_loop_config = {**config, "distributed_strategy": self.strategy}
trainer = trainer_cls(
train_loop_per_worker=train_loop_per_worker,
@@ -475,11 +479,18 @@ def train(
stream_window_size["test"] = test_set.window_size_bytes

with create_runner(**self.trainer_kwargs) as runner:
# Extract weights as numpy tensors and place them in the Ray object store.
# If we store the weights of a model as NumPy arrays on Plasma, we can access those
# weights directly out of Plasma’s shared memory segments, without making any copies.
# This enables zero copy model loading on each training worker using shared
# memory from the Ray object store for model initialization.
dist_strategy = runner.dist_strategy
model_ref = ray.put(dist_strategy.extract_model_for_serialization(self.model))
trainer_results = runner.run(
lambda config: train_fn(**config),
config={
"executable_kwargs": executable_kwargs,
"model_ref": ray.put(self.model),
"model_ref": model_ref,
"remote_trainer_cls": self.remote_trainer_cls,
**kwargs,
},
@@ -489,6 +500,13 @@
stream_window_size=stream_window_size,
)

# re-register the weights of the model object in the main process
self.model = dist_strategy.replace_model_from_serialization(ray.get(model_ref))

# ensure module is initialized exactly as it is in the trainer process
# so that the state dict can be loaded back into the model correctly.
self.model.prepare_for_training()

# Set validation field and metric used by trainer
self._validation_field = trainer_results.metrics["validation_field"]
self._validation_metric = trainer_results.metrics["validation_metric"]
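The comment blocks added to train_fn and train() above describe the whole round trip: the coordinator strips the model's weights out as NumPy arrays, puts them into the Ray object store (Plasma) once, and each training worker maps them straight out of shared memory before rebuilding the torch tensors in place on CPU. A minimal, self-contained sketch of that handoff, reusing the extract_tensors/replace_tensors helpers introduced in ludwig/utils/model_utils.py later in this diff (the toy nn.Linear model and the remote task are illustrative, not part of this commit):

# Illustrative sketch only -- not part of this commit.
import ray
import torch

from ludwig.utils.model_utils import extract_tensors, replace_tensors

ray.init()

# Coordinator: strip the weights out of the module as NumPy arrays and place the
# (skeleton module, weights) tuple in the Ray object store (Plasma).
model = torch.nn.Linear(8, 4)  # stand-in for the LLM being trained
model_ref = ray.put(extract_tensors(model))


@ray.remote
def train_worker(config):
    # Worker: ray.get maps the NumPy arrays out of Plasma's shared memory without
    # copying them; replace_tensors then rebuilds the torch tensors in place on CPU.
    skeleton, weights = ray.get(config["model_ref"])
    replace_tensors(skeleton, weights, torch.device("cpu"))
    return sum(p.numel() for p in skeleton.parameters())


print(ray.get(train_worker.remote({"model_ref": model_ref})))

Arrays fetched from Plasma are read-only, so PyTorch may warn when a sketch like this wraps them as parameters.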
11 changes: 10 additions & 1 deletion ludwig/distributed/base.py
@@ -1,6 +1,6 @@
import contextlib
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, Optional, Tuple, Type, TYPE_CHECKING
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TYPE_CHECKING, Union

import torch
from torch import nn
@@ -182,6 +182,15 @@ def create_checkpoint_handle(

return MultiNodeCheckpoint(self, model, optimizer, scheduler)

@classmethod
def extract_model_for_serialization(cls, model: nn.Module) -> Union[nn.Module, Tuple[nn.Module, List[Dict]]]:
return model

@classmethod
def replace_model_from_serialization(cls, state: Union[nn.Module, Tuple[nn.Module, List[Dict]]]) -> nn.Module:
assert isinstance(state, nn.Module)
return state


class LocalStrategy(DistributedStrategy):
def prepare(
14 changes: 13 additions & 1 deletion ludwig/distributed/deepspeed.py
@@ -1,7 +1,7 @@
import logging
import os
import warnings
from typing import Any, Dict, Mapping, Optional, Tuple, TYPE_CHECKING
from typing import Any, Dict, List, Mapping, Optional, Tuple, TYPE_CHECKING, Union

import deepspeed
import deepspeed.comm
@@ -14,6 +14,7 @@
from ludwig.distributed.ddp import DDPStrategy
from ludwig.modules.optimization_modules import get_optimizer_class_and_kwargs
from ludwig.utils.checkpoint_utils import Checkpoint
from ludwig.utils.model_utils import extract_tensors, replace_tensors

if TYPE_CHECKING:
from ludwig.modules.lr_scheduler import LRScheduler
@@ -219,3 +220,14 @@ def get_state_for_inference(self, save_path: str, device: Optional[torch.device]
save_path, load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=True
)
return self.model.module.cpu().state_dict()

@classmethod
def extract_model_for_serialization(cls, model: nn.Module) -> Union[nn.Module, Tuple[nn.Module, List[Dict]]]:
return extract_tensors(model)

@classmethod
def replace_model_from_serialization(cls, state: Union[nn.Module, Tuple[nn.Module, List[Dict]]]) -> nn.Module:
assert isinstance(state, tuple)
model, model_weights = state
replace_tensors(model, model_weights, torch.device("cpu"))
return model
24 changes: 14 additions & 10 deletions ludwig/models/llm.py
@@ -89,17 +89,20 @@ def __init__(
self._random_seed = random_seed

self.model_name = self.config_obj.base_model
self.model_config = AutoConfig.from_pretrained(self.config_obj.base_model)

logger.info("Loading large language model...")
self.model = AutoModelForCausalLM.from_pretrained(self.config_obj.base_model)
self.curr_device = torch.device("cpu") # model initially loaded onto cpu

# Model initially loaded onto cpu
self.curr_device = torch.device("cpu")
logger.info("Done.")

# Determines the maximum length of the context (input + output tokens)
if hasattr(self.model.config, "max_sequence_length"):
self.context_len = self.model.config.max_sequence_length
elif hasattr(self.model.config, "max_position_embeddings"):
self.context_len = self.model.config.max_position_embeddings
if hasattr(self.model_config, "max_sequence_length"):
self.context_len = self.model_config.max_sequence_length
elif hasattr(self.model_config, "max_position_embeddings"):
self.context_len = self.model_config.max_position_embeddings
else:
self.context_len = 2048

@@ -121,7 +124,7 @@

# Initialize tokenizer
use_fast = True
if isinstance(AutoConfig.from_pretrained(self.config_obj.base_model), LlamaConfig):
if isinstance(self.model_config, LlamaConfig):
# HACK: Llama fast tokenizer takes about 2-4 minutes to load, so we disable it for now.
use_fast = False
self.tokenizer = AutoTokenizer.from_pretrained(self.config_obj.base_model, use_fast=use_fast)
@@ -152,16 +155,13 @@ def __init__(
# because the model has additional "head" layers that are used to predict the next
# token in the sequence. These head layers can add additional dimensions to the
# logits tensor, beyond the vocab_size dimension.
input_size=self.input_shape[-1] if self.output_feature_type == TEXT else self.model.config.vocab_size,
input_size=self.input_shape[-1] if self.output_feature_type == TEXT else self.model_config.vocab_size,
)
)

# Extract the decoder object for the forward pass
self._output_feature_decoder = ModuleWrapper(self.output_features.items()[0][1])

# Initialize the PEFT adapter if one is provided
self.initialize_adapter()

clear_data_cache()

def create_feature_dict(self) -> LudwigFeatureDict:
@@ -193,6 +193,10 @@ def initialize_adapter(self):
self.model.print_trainable_parameters()
logger.info("==================================================")

def prepare_for_training(self):
# TODO: this implementation will not work if resuming from a previous checkpoint. Need to fix this.
self.initialize_adapter()

def to_device(self, device):
device = torch.device(device)

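The prepare_for_training() hook added here moves PEFT adapter initialization out of the LLM constructor, so the adapter layers are created inside each training process rather than travelling through Plasma with the base weights. The trainer.py change below is what invokes it; the intended ordering, sketched with model and distributed standing in for the Trainer's attributes:

# Illustrative ordering only; `model` and `distributed` stand in for the Trainer's attributes below.
model.prepare_for_training()          # create the PEFT adapter inside the trainer process
model = distributed.to_device(model)  # move to the training device after the adapter exists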
1 change: 1 addition & 0 deletions ludwig/trainers/trainer.py
@@ -175,6 +175,7 @@ def __init__(
self.base_learning_rate = base_learning_rate

self.model = model
self.model.prepare_for_training()
self.model = self.distributed.to_device(self.model)
self.model.metrics_to_device(self.device)

76 changes: 76 additions & 0 deletions ludwig/utils/model_utils.py
@@ -0,0 +1,76 @@
from collections import OrderedDict
from typing import Dict, List, Tuple

import numpy as np
import torch

NUMPY_TO_TORCH_DTYPE = {
bool: torch.bool,
np.bool_: torch.bool,
np.uint8: torch.uint8,
np.int8: torch.int8,
np.int16: torch.int16,
np.int32: torch.int32,
np.int64: torch.int64,
np.float16: torch.float16,
np.float32: torch.float32,
np.float64: torch.float64,
np.complex64: torch.complex64,
np.complex128: torch.complex128,
}


def extract_tensors(model: torch.nn.Module) -> Tuple[torch.nn.Module, List[Dict]]:
"""Remove the tensors from a PyTorch model, convert them to NumPy arrays, and return the stripped model and
tensors.
Reference implementation: https://medium.com/ibm-data-ai/how-to-load-pytorch-models-340-times-faster-with-
ray-8be751a6944c # noqa
"""

tensors = []
for _, module in model.named_modules():
# Store the tensors as numpy arrays in Python dictionaries
# Delete the same tensors since we no longer need them and we want to reduce memory pressure.
# This ensures that throughout this process, we keep memory nearly linear w.r.t model parameters.
params = OrderedDict()
buffers = OrderedDict()
for name, param in module.named_parameters(recurse=False):
params[name] = torch.clone(param).detach().numpy()
del param
for name, buf in module.named_buffers(recurse=False):
buffers[name] = torch.clone(buf).detach().numpy()
del buf
tensors.append({"params": params, "buffers": buffers})

# Strip all tensors and buffers out of the original model.
for _, module in model.named_modules():
for name in [name for name, _ in module.named_parameters(recurse=False)] + [
name for name, _ in module.named_buffers(recurse=False)
]:
setattr(module, name, None)

return model, tensors


def replace_tensors(m: torch.nn.Module, tensors: List[Dict], device: torch.device):
"""Restore the tensors that extract_tensors() stripped out of a PyTorch model. This operation is performed in
place.
Reference implementation: https://medium.com/ibm-data-ai/how-to-load-pytorch-models-340-times-faster-with-
ray-8be751a6944c # noqa
"""
modules = [module for _, module in m.named_modules()]
for module, tensor_dict in zip(modules, tensors):
# There are separate APIs to set parameters and buffers.
for name, array in tensor_dict["params"].items():
module.register_parameter(
name,
torch.nn.Parameter(torch.as_tensor(array, device=device, dtype=NUMPY_TO_TORCH_DTYPE.get(array.dtype))),
)

for name, array in tensor_dict["buffers"].items():
module.register_buffer(
name,
torch.as_tensor(array, device=device, dtype=NUMPY_TO_TORCH_DTYPE.get(array.dtype)),
)
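A quick way to sanity-check the two helpers above locally, without Ray, is to strip a small module, restore it, and compare state dicts. This snippet is illustrative and not part of the commit:

# Illustrative round-trip check -- not part of this commit.
import torch

from ludwig.utils.model_utils import extract_tensors, replace_tensors

original = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
reference = {k: v.clone() for k, v in original.state_dict().items()}

skeleton, weights = extract_tensors(original)            # parameters and buffers are now None
replace_tensors(skeleton, weights, torch.device("cpu"))  # restored in place as CPU tensors

assert all(torch.equal(reference[k], v) for k, v in skeleton.state_dict().items())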
4 changes: 4 additions & 0 deletions ludwig/utils/torch_utils.py
@@ -183,6 +183,10 @@ def __init__(self):
def device(self):
return self.device_tensor.device

def prepare_for_training(self):
"""This is called from within the Trainer object to do any final instantiation before model training."""
pass

def losses(self):
collected_losses = []
for loss in self._losses.values():
10 changes: 7 additions & 3 deletions requirements.txt
@@ -10,9 +10,13 @@ torch>=1.13.0
torchaudio
torchtext
torchvision
transformers>=4.28.1
transformers>=4.31.0
tokenizers>=0.13.3
spacy>=2.3
PyYAML>=3.12

# https://github.com/yaml/pyyaml/issues/601
PyYAML>=3.12,<6.0.1

absl-py
kaggle
requests
@@ -24,7 +28,7 @@ marshmallow
marshmallow-jsonschema
marshmallow-dataclass==8.5.4
tensorboard
torchmetrics<=0.11.4
torchmetrics>=0.11.0,<=0.11.4
torchinfo
filelock
psutil==5.9.4