HabanaAI · tpawlows · Oct 25, 2024 · Oct 23, 2024 · Oct 24, 2024 · Oct 25, 2024
diff --git a/requirements-hpu.txt b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
@@ -226,9 +226,13 @@ def __init__(
         self.num_expert_group = num_expert_group
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
-        if current_platform.is_hpu():
-            from vllm_hpu_extension.ops import StaticFusedMOE
-            self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts)
+        if is_hpu:
+            from vllm_hpu_extension.ops import DynamicFusedMOE, StaticFusedMOE
+
+            from vllm.model_executor.layers.quantization.inc import INCConfig
+            selected_fused_moe = (StaticFusedMOE if isinstance(
+                quant_config, INCConfig) else DynamicFusedMOE)
+            self.hpu_static_fused_moe = selected_fused_moe(self.num_experts)
 
         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = (
@@ -321,8 +325,10 @@ def _load_w13(self,
         expert_data.copy_(loaded_weight)
 
         if is_hpu:
-            self.hpu_static_fused_moe.w13_list[expert_id].set_weight(
-                orig_exp_data)
+            from vllm_hpu_extension.ops import StaticFusedMOE
+            if isinstance(self.hpu_static_fused_moe, StaticFusedMOE):
+                self.hpu_static_fused_moe.w13_list[expert_id].set_weight(
+                    orig_exp_data)
 
     def _load_w2(self,
                  expert_data: torch.Tensor,
@@ -341,8 +347,10 @@ def _load_w2(self,
         # w2, down_proj: Load into only logical weight of w2.
         expert_data.copy_(loaded_weight)
         if is_hpu:
-            self.hpu_static_fused_moe.w2_list[expert_id].set_weight(
-                expert_data)
+            from vllm_hpu_extension.ops import StaticFusedMOE
+            if isinstance(self.hpu_static_fused_moe, StaticFusedMOE):
+                self.hpu_static_fused_moe.w2_list[expert_id].set_weight(
+                    expert_data)
 
     def _load_single_value(self, param: torch.nn.Parameter,
                            loaded_weight: torch.Tensor, expert_id: int):