diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 20f4dc74a3955..4019950062efe 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 457450cda2ce6..8f6bdaa7ab44a 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -226,9 +226,13 @@ def __init__( self.num_expert_group = num_expert_group self.topk_group = topk_group self.custom_routing_function = custom_routing_function - if current_platform.is_hpu(): - from vllm_hpu_extension.ops import StaticFusedMOE - self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts) + if is_hpu: + from vllm_hpu_extension.ops import DynamicFusedMOE, StaticFusedMOE + + from vllm.model_executor.layers.quantization.inc import INCConfig + selected_fused_moe = (StaticFusedMOE if isinstance( + quant_config, INCConfig) else DynamicFusedMOE) + self.hpu_static_fused_moe = selected_fused_moe(self.num_experts) if quant_config is None: self.quant_method: Optional[QuantizeMethodBase] = ( @@ -321,8 +325,10 @@ def _load_w13(self, expert_data.copy_(loaded_weight) if is_hpu: - self.hpu_static_fused_moe.w13_list[expert_id].set_weight( - orig_exp_data) + from vllm_hpu_extension.ops import StaticFusedMOE + if isinstance(self.hpu_static_fused_moe, StaticFusedMOE): + self.hpu_static_fused_moe.w13_list[expert_id].set_weight( + orig_exp_data) def _load_w2(self, expert_data: torch.Tensor, @@ -341,8 +347,10 @@ def _load_w2(self, # w2, down_proj: Load into only logical weight of w2. expert_data.copy_(loaded_weight) if is_hpu: - self.hpu_static_fused_moe.w2_list[expert_id].set_weight( - expert_data) + from vllm_hpu_extension.ops import StaticFusedMOE + if isinstance(self.hpu_static_fused_moe, StaticFusedMOE): + self.hpu_static_fused_moe.w2_list[expert_id].set_weight( + expert_data) def _load_single_value(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int):