diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 18ce8d7f7d164..a64edc94825f3 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -3,6 +3,7 @@ # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# Copyright 2024 Habana Labs, Ltd. an Intel Company # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX # and OPT implementations in this library. It has been modified from its @@ -420,6 +421,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + if is_hpu: + torch.hpu.synchronize() # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). Thus, handled exceptions should