Fix performance of top_p and top_k calculations

HabanaAI · Oct 30, 2024 · bc6e304
1 parent 94858b5
commit bc6e304
Showing 1 changed file with 6 additions and 6 deletions.
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
@@ -267,12 +267,12 @@ def forward(
 
         if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None:
             # If we have a scalar p and k, we can use the optimized version.
-            logits = torch.where(
-                self._scalar_p_and_k,
-                self._apply_top_k_top_p_opt(logits, self._top_p_scalar,
-                                            self._top_k_scalar),
-                _apply_top_k_top_p(logits, sampling_tensors.top_ps,
-                                   sampling_tensors.top_ks))
+            if self._scalar_p_and_k.any():
+                logits = self._apply_top_k_top_p_opt(logits, self._top_p_scalar.item(),
+                                                     self._top_k_scalar.item())
+            else:
+                logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps,
+                                            sampling_tensors.top_ks)
 
         if do_min_p:
             logits = _apply_min_p(logits, sampling_tensors.min_ps)