diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f8b9c48bc9589..a85b746297315 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -854,9 +854,6 @@ def _process_model_outputs( request_outputs.append(request_output) return request_outputs - def finish_measurements(self): - self.model_executor.finish_measurements() - def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fc9f118ff14b2..62309ed345b1d 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -173,9 +173,6 @@ def set_tokenizer( self.llm_engine.tokenizer.tokenizer = get_cached_tokenizer( tokenizer) - def finish_measurements(self): - self.llm_engine.finish_measurements() - @overload # LEGACY: single (prompt + optional token ids) def generate( self, diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 80f8037a2d043..6b97fb2e381af 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -90,9 +90,6 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: msg = f"init_cache_engine took {cache_init_m.get_summary_string()}" logger.info(msg) - def finish_measurements(self): - self.driver_worker.finish_measurements() - def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 17e3414a96b57..9e0a89cbeb8aa 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -237,9 +237,6 @@ def _driver_execute_model( return self.driver_worker.execute_method("execute_model", execute_model_req) - def finish_measurements(self): - self._run_workers("finish_measurements") - def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 72aba42ae8553..ea8b05f80ab3f 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1503,4 +1503,5 @@ def shutdown_inc(self): print('inc shutdown') def __del__(self): + self.finish_measurements() self.shutdown_inc() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 87122c03d3c8f..6afa88a6238f5 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -223,9 +223,6 @@ def _warm_up_model(self) -> None: # the model initialization and profiling. set_random_seed(self.model_config.seed) - def finish_measurements(self): - self.model_runner.finish_measurements() - @property def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1