From 6d7e1fad7b028613baa98eb36180c8e58c60c711 Mon Sep 17 00:00:00 2001
From: Yang Wang <y3wang@habana.ai>
Date: Fri, 25 Oct 2024 03:09:18 +0300
Subject: [PATCH] benchmark_throughput: remove commented-out debug code and 3x repeat loop; set temperature=1.0

---
 benchmarks/benchmark_throughput.py | 47 +++++++++++++-----------------
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index a8e85b71a5441..e1a359b871e71 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -133,40 +133,33 @@ def run_vllm(
         sampling_params.append(
             SamplingParams(
                 n=n,
-                temperature=0.0,
+                temperature=1.0,
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=output_len,
             ))
-    # from vllm.utils import Device
-    # for i in range(5):
-    #     start = time.perf_counter()
-    #     llm.generate(prompts, sampling_params, use_tqdm=True)
-    #     end = time.perf_counter()
-        # print(llm.llm_engine.scheduler[0].block_manager.block_allocator._allocators[Device.GPU]._free_block_indices)
 
     use_beam_search = False
 
-    for i in range(3):
-        if not use_beam_search:
-            start = time.perf_counter()
-            llm.generate(prompts, sampling_params, use_tqdm=True)
-            end = time.perf_counter()
-        else:
-            prompts = [prompt for prompt, _, _ in requests]
-            # output_len should be the same for all requests.
-            output_len = requests[0][2]
-            for prompt, input_len, _output_len in requests:
-                assert _output_len == output_len
-            start = time.perf_counter()
-            llm.beam_search(
-                prompts,
-                BeamSearchParams(
-                    beam_width=n,
-                    max_tokens=output_len,
-                    ignore_eos=True,
-                ))
-            end = time.perf_counter()
+    if not use_beam_search:
+        start = time.perf_counter()
+        llm.generate(prompts, sampling_params, use_tqdm=True)
+        end = time.perf_counter()
+    else:
+        prompts = [prompt for prompt, _, _ in requests]
+        # output_len should be the same for all requests.
+        output_len = requests[0][2]
+        for prompt, input_len, _output_len in requests:
+            assert _output_len == output_len
+        start = time.perf_counter()
+        llm.beam_search(
+            prompts,
+            BeamSearchParams(
+                beam_width=n,
+                max_tokens=output_len,
+                ignore_eos=True,
+            ))
+        end = time.perf_counter()
     return end - start