From 56ea91024d1c72a637905bc4e4186f9df3c28587 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Thu, 10 Oct 2024 14:53:26 +0300
Subject: [PATCH 1/8] Contiguous PA POC

---
 vllm/worker/hpu_model_runner.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 382a0abb21240..9ac68b438fdc4 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1018,12 +1018,22 @@ def _prepare_decode(
         block_mapping: List[int] = list(
             itertools.chain.from_iterable(block_mapping_nested))
 
-        last_block = [
-            sl % self.block_size + 1 for sl in itertools.chain(*slot_mapping)
-        ]
-        block_usage = [[self.block_size] * (b_u - 1) + [lb]
-                       for b_u, lb in zip(blocks_used, last_block)]
-        block_usage = list(itertools.chain(*block_usage))
+        max_idx = max(block_list)
+        max_blocks = max(max_idx + 1, len(block_list))
+        block_bucket_size = find_bucket(max_blocks, self.decode_block_bucket_cfg)
+
+        block_mapping = [None] * block_bucket_size
+        block_usage = [None] * block_bucket_size
+        for i, bt in enumerate(block_tables):
+            for b in bt:
+                if block_mapping[b] is None:
+                    block_mapping[b] = i
+                    block_usage[b] = self.block_size
+        block_mapping = [b if b is not None else 0 for b in block_mapping]
+
+        for bt, sl in zip(block_tables, slot_mapping):
+            block_usage[bt[-1]] = sl[-1] % self.block_size + 1
+        block_usage = [u if u is not None else 0 for u in block_usage]
 
         block_bucket_size = find_bucket(
             len(block_list),

From b03fb6e21874301dcb4b3abbf582178e6cd02b74 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Mon, 14 Oct 2024 17:27:52 +0300
Subject: [PATCH 2/8] Limit block_bucket_size

---
 vllm/worker/hpu_model_runner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 9ac68b438fdc4..0ab64499a1642 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1021,6 +1021,7 @@ def _prepare_decode(
         max_idx = max(block_list)
         max_blocks = max(max_idx + 1, len(block_list))
         block_bucket_size = find_bucket(max_blocks, self.decode_block_bucket_cfg)
+        block_bucket_size = min(block_bucket_size, self.cache_config.num_gpu_blocks)
 
         block_mapping = [None] * block_bucket_size
         block_usage = [None] * block_bucket_size
@@ -1027,12 +1028,13 @@ def _prepare_decode(
         for i, bt in enumerate(block_tables):
             for b in bt:
                 if block_mapping[b] is None:
                     block_mapping[b] = i
                     block_usage[b] = self.block_size
         block_mapping = [b if b is not None else 0 for b in block_mapping]
 
         for bt, sl in zip(block_tables, slot_mapping):
-            block_usage[bt[-1]] = sl[-1] % self.block_size + 1
+            if bt:
+                block_usage[bt[-1]] = sl[-1] % self.block_size + 1
         block_usage = [u if u is not None else 0 for u in block_usage]
 
         block_bucket_size = find_bucket(
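These first two patches replace the packed per-sequence decode metadata with a layout indexed by physical block id: block_mapping[b] names the sequence that owns KV-cache block b, block_usage[b] records how many tokens of that block are filled, and both lists are padded to a bucketed size that patch 2 caps at the number of available KV-cache blocks (while also guarding against empty block tables). Below is a minimal standalone sketch of the construction as it stands after patch 2; the inputs are hypothetical and find_bucket is a simplified stand-in for the fork's bucketing helper, not its real implementation.

    import itertools

    def round_up(n, step):
        return -(-n // step) * step

    def find_bucket(value, cfg):
        # Simplified stand-in: the fork's warmup buckets use an
        # exponential-then-linear schedule; here we only round up to the step.
        bmin, bstep, _bmax = cfg
        return max(bmin, round_up(value, bstep))

    def build_decode_metadata(block_tables, slot_mapping, block_size,
                              bucket_cfg, num_gpu_blocks):
        block_list = list(itertools.chain(*block_tables))
        max_blocks = max(max(block_list) + 1, len(block_list))
        bucket = min(find_bucket(max_blocks, bucket_cfg), num_gpu_blocks)

        # Metadata is indexed by *physical* block id: block b's entry lives
        # at offset b, not at a packed per-sequence offset.
        block_mapping = [None] * bucket
        block_usage = [None] * bucket
        for seq_idx, bt in enumerate(block_tables):
            for b in bt:
                if block_mapping[b] is None:
                    block_mapping[b] = seq_idx
                    block_usage[b] = block_size
        # Only the last block of each sequence is partially filled; its fill
        # level is recovered from the sequence's last slot id.
        for bt, sl in zip(block_tables, slot_mapping):
            if bt:
                block_usage[bt[-1]] = sl[-1] % block_size + 1
        block_mapping = [m if m is not None else 0 for m in block_mapping]
        block_usage = [u if u is not None else 0 for u in block_usage]
        return block_mapping, block_usage

    # Two sequences, block_size=4: seq 0 owns blocks [0, 2], seq 1 owns [1].
    print(build_decode_metadata([[0, 2], [1]], [[0, 9], [6]], 4,
                                (8, 8, 128), num_gpu_blocks=16))

Indexing by physical block id rather than by packed offset is what makes the paged-attention metadata "contiguous": entry b always describes block b, regardless of which sequence owns it.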
From a490556df90cac5355985ac6c28f2cf270e9e9b4 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Tue, 15 Oct 2024 12:23:15 +0300
Subject: [PATCH 3/8] Warmup for buckets with max_blocks

---
 vllm/worker/hpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 0ab64499a1642..7be9946c627f5 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -199,10 +199,11 @@ def generate_decode_buckets(bs_bucket_config, blocks_bucket_config,
     bs_buckets = warmup_range(bs_bucket_config)
     block_buckets = warmup_range(blocks_bucket_config)
     bmin, bstep, bmax = blocks_bucket_config
-    last_bucket = round_up(max_blocks, bstep)
+    last_bucket = max_blocks
    for bs in bs_buckets:
         for blocks in block_buckets:
             if blocks > last_bucket:
+                buckets.append((bs, last_bucket))
                 break
             buckets.append((bs, blocks))
     return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0])))

From 05b3d09742c2ca19b1ac6c4736d750a306844182 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Thu, 17 Oct 2024 10:49:04 +0300
Subject: [PATCH 4/8] block_mapping padding fix

---
 vllm/worker/hpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 7be9946c627f5..96b157f0d0f55 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1031,7 +1031,7 @@ def _prepare_decode(
                 if block_mapping[b] is None:
                     block_mapping[b] = i
                     block_usage[b] = self.block_size
-        block_mapping = [b if b is not None else 0 for b in block_mapping]
+        block_mapping = [b if b is not None else -1 for b in block_mapping]
 
         for bt, sl in zip(block_tables, slot_mapping):
             if bt:

From 3b028cc3a275a56d318064281197e914c00f29f9 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Wed, 23 Oct 2024 16:03:46 +0300
Subject: [PATCH 5/8] New softmax

---
 vllm/worker/hpu_model_runner.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 96b157f0d0f55..c8ece44da4680 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1003,25 +1003,18 @@ def _prepare_decode(
 
         num_decode_tokens = sum(seq_lens)
 
-        blocks_used = [len(bt) for bt in block_tables if bt]
-        block_list = []
+        block_list = list(itertools.chain(*block_tables))
+
         block_scales = []
         for i, bt in enumerate(block_tables):
-            block_list.extend(bt)
             blocks_in_group = len(bt)
             if blocks_in_group > 0:
                 scale = 1.0 / blocks_in_group
                 block_scales.extend([scale] * blocks_in_group)
 
-        block_mapping_nested: List[List[int]] = [
-            [i] * b_u for i, b_u in enumerate(blocks_used)
-        ]
-        block_mapping: List[int] = list(
-            itertools.chain.from_iterable(block_mapping_nested))
-
         max_idx = max(block_list)
         max_blocks = max(max_idx + 1, len(block_list))
-        block_bucket_size = find_bucket(max_blocks, self.decode_block_bucket_cfg)
+        block_bucket_size = find_bucket(max_blocks, self.bucketing_global_state.decode_block_bucket_cfg)
         block_bucket_size = min(block_bucket_size, self.cache_config.num_gpu_blocks)
 
         block_mapping = [None] * block_bucket_size
         block_usage = [None] * block_bucket_size
@@ -1028,18 +1021,15 @@ def _prepare_decode(
         for i, bt in enumerate(block_tables):
             for b in bt:
                 if block_mapping[b] is None:
                     block_mapping[b] = i
                     block_usage[b] = self.block_size
         block_mapping = [b if b is not None else -1 for b in block_mapping]
 
         for bt, sl in zip(block_tables, slot_mapping):
             if bt:
                 block_usage[bt[-1]] = sl[-1] % self.block_size + 1
         block_usage = [u if u is not None else 0 for u in block_usage]
 
-        block_bucket_size = find_bucket(
-            len(block_list),
-            self.bucketing_global_state.decode_block_bucket_cfg)
         block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID)
         block_groups = pad_list(block_mapping, block_bucket_size,
                                 len(block_tables))
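Patch 3 changes warmup so the block dimension of decode buckets is capped at the real maximum block count instead of being rounded up to the next bucket step: once a bucket would exceed max_blocks, a single capped (bs, max_blocks) bucket is emitted and the inner loop stops, so warmup compiles exactly the largest shape that can occur at runtime. Patch 4 then fixes the block_mapping pad value (-1 instead of 0, which is a valid sequence index), and patch 5 folds block_list construction into one itertools.chain pass for the new softmax path. A runnable sketch of the bucket generation follows; warmup_range here is a simplified linear stand-in for the fork's min/step/max range expansion.

    def warmup_range(cfg):
        bmin, bstep, bmax = cfg
        return list(range(bmin, bmax + 1, bstep))

    def generate_decode_buckets(bs_bucket_config, blocks_bucket_config,
                                max_blocks):
        buckets = []
        bs_buckets = warmup_range(bs_bucket_config)
        block_buckets = warmup_range(blocks_bucket_config)
        # Cap the block dimension at max_blocks rather than rounding it up to
        # the next step: the capped bucket is emitted once, then we stop.
        last_bucket = max_blocks
        for bs in bs_buckets:
            for blocks in block_buckets:
                if blocks > last_bucket:
                    buckets.append((bs, last_bucket))
                    break
                buckets.append((bs, blocks))
        return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0])))

    # With 20 usable blocks and block buckets 8/16/24, warmup covers (bs, 8),
    # (bs, 16) and the capped (bs, 20) instead of an unreachable (bs, 24).
    print(generate_decode_buckets((1, 1, 2), (8, 8, 24), max_blocks=20))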
From f3488dca248116895370cc16ff5b9bd0abbb2f03 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Thu, 24 Oct 2024 13:19:32 +0300
Subject: [PATCH 6/8] Softmax normalization adjustment

---
 requirements-hpu.txt            |  2 +-
 vllm/worker/hpu_model_runner.py | 30 ++++++++++++++----------------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index 7cefa4e631fa8..4719639da6188 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@c2801bb
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@6cb6e19

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index c8ece44da4680..3487fec0f1d5e 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1005,13 +1005,6 @@ def _prepare_decode(
 
         block_list = list(itertools.chain(*block_tables))
 
-        block_scales = []
-        for i, bt in enumerate(block_tables):
-            blocks_in_group = len(bt)
-            if blocks_in_group > 0:
-                scale = 1.0 / blocks_in_group
-                block_scales.extend([scale] * blocks_in_group)
-
         max_idx = max(block_list)
         max_blocks = max(max_idx + 1, len(block_list))
         block_bucket_size = find_bucket(max_blocks, self.bucketing_global_state.decode_block_bucket_cfg)
@@ -1019,25 +1012,30 @@ def _prepare_decode(
         block_mapping = [None] * block_bucket_size
         block_usage = [None] * block_bucket_size
+        block_scales = [None] * block_bucket_size
+
         for i, bt in enumerate(block_tables):
-            for b in bt:
-                if block_mapping[b] is None:
-                    block_mapping[b] = i
-                    block_usage[b] = self.block_size
+            if bt:
+                blocks_in_group = len(bt)
+                scale = 1.0 / blocks_in_group
+                for b in bt:
+                    if block_mapping[b] is None:
+                        block_mapping[b] = i
+                        block_usage[b] = self.block_size
+                        block_scales[b] = scale
+
         block_mapping = [b if b is not None else -1 for b in block_mapping]
+        block_scales = [b if b is not None else 0.0 for b in block_scales]
 
         for bt, sl in zip(block_tables, slot_mapping):
             if bt:
                 block_usage[bt[-1]] = sl[-1] % self.block_size + 1
-        block_usage = [u if u is not None else 0 for u in block_usage]
+        block_usage = [u if u is not None else 1 for u in block_usage]
 
         block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID)
         block_groups = pad_list(block_mapping, block_bucket_size,
                                 len(block_tables))
-        block_mapping = pad_list(block_mapping, block_bucket_size, -1)
-        block_usage = pad_list(block_usage, block_bucket_size, 1)
-        block_scales = pad_list(block_scales, block_bucket_size, 0.0)
-
+
         block_list = torch.tensor(block_list,
                                   dtype=torch.int,
                                   device=self.device)

From e8dfc9e26ca0457eee244fde23f9d63f65bc8e24 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Fri, 25 Oct 2024 11:09:48 +0300
Subject: [PATCH 7/8] Formatting

---
 vllm/worker/hpu_model_runner.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 3487fec0f1d5e..2760f1e6a31fa 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1007,8 +1007,12 @@ def _prepare_decode(
 
         max_idx = max(block_list)
         max_blocks = max(max_idx + 1, len(block_list))
-        block_bucket_size = find_bucket(max_blocks, self.bucketing_global_state.decode_block_bucket_cfg)
-        block_bucket_size = min(block_bucket_size, self.cache_config.num_gpu_blocks)
+        block_bucket_size = find_bucket(
+            max_blocks, self.bucketing_global_state.decode_block_bucket_cfg
+        )
+        block_bucket_size = min(
+            block_bucket_size, self.cache_config.num_gpu_blocks
+        )
 
         block_mapping = [None] * block_bucket_size
         block_usage = [None] * block_bucket_size
@@ -1030,7 +1034,7 @@ def _prepare_decode(
         for bt, sl in zip(block_tables, slot_mapping):
             if bt:
                 block_usage[bt[-1]] = sl[-1] % self.block_size + 1
-        block_usage = [u if u is not None else 1 for u in block_usage] 
+        block_usage = [u if u is not None else 1 for u in block_usage]
 
         block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID)
         block_groups = pad_list(block_mapping, block_bucket_size,
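Patch 6 is the substantive half of the softmax normalization change: the per-sequence scale 1.0 / blocks_in_group moves into the same physical-block-indexed layout as block_mapping and block_usage (0.0 on unowned blocks, and usage's pad value becomes 1 rather than 0, presumably to keep padded entries numerically safe), while the requirements bump pins the vllm-hpu-extension commit (6cb6e19) that carries the matching kernel side. Patch 7 is formatting only. A standalone sketch of just the scale layout follows; the inputs are hypothetical, and how the scales enter the attention softmax lives in the extension and is not shown here.

    def build_block_scales(block_tables, block_bucket_size):
        block_scales = [None] * block_bucket_size
        for bt in block_tables:
            if bt:
                # Each block of a sequence carries 1/len(block_table), so the
                # per-block contributions of one sequence sum to 1.
                scale = 1.0 / len(bt)
                for b in bt:
                    if block_scales[b] is None:
                        block_scales[b] = scale
        return [s if s is not None else 0.0 for s in block_scales]

    # Seq 0 owns blocks [0, 2] (scale 0.5); seq 1 owns block [1] (scale 1.0);
    # unowned/padding blocks get 0.0 so they cannot contribute.
    print(build_block_scales([[0, 2], [1]], block_bucket_size=4))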
From e209acf2dbc9ec62d347e9e0ead1bbbfcf7d034d Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Fri, 25 Oct 2024 12:04:47 +0300
Subject: [PATCH 8/8] Type annotations

---
 vllm/worker/hpu_model_runner.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 2760f1e6a31fa..4be0dc1a1abd8 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -203,7 +203,7 @@ def generate_decode_buckets(bs_bucket_config, blocks_bucket_config,
     for bs in bs_buckets:
         for blocks in block_buckets:
             if blocks > last_bucket:
-               buckets.append((bs, last_bucket))
+                buckets.append((bs, last_bucket))
                 break
             buckets.append((bs, blocks))
     return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0])))
@@ -1008,15 +1008,13 @@ def _prepare_decode(
         max_idx = max(block_list)
         max_blocks = max(max_idx + 1, len(block_list))
         block_bucket_size = find_bucket(
-            max_blocks, self.bucketing_global_state.decode_block_bucket_cfg
-        )
-        block_bucket_size = min(
-            block_bucket_size, self.cache_config.num_gpu_blocks
-        )
+            max_blocks, self.bucketing_global_state.decode_block_bucket_cfg)
+        block_bucket_size = min(block_bucket_size,
+                                self.cache_config.num_gpu_blocks)
 
-        block_mapping = [None] * block_bucket_size
-        block_usage = [None] * block_bucket_size
-        block_scales = [None] * block_bucket_size
+        block_mapping: List[Union[None, int]] = [None] * block_bucket_size
+        block_usage: List[Union[None, int]] = [None] * block_bucket_size
+        block_scales: List[Union[None, float]] = [None] * block_bucket_size
 
         for i, bt in enumerate(block_tables):
             if bt:
@@ -1039,7 +1037,6 @@ def _prepare_decode(
         block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID)
         block_groups = pad_list(block_mapping, block_bucket_size,
                                 len(block_tables))
-
         block_list = torch.tensor(block_list,
                                   dtype=torch.int,
                                   device=self.device)
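After patch 8's mypy-friendly annotations, the series leaves _prepare_decode emitting block-indexed, bucket-padded metadata end to end. A worked standalone example of the resulting values follows; the two-sequence state is hypothetical, pad_list is reimplemented locally, and _PAD_BLOCK_ID = 0 is assumed for illustration.

    import itertools
    from typing import List, Union

    import torch

    def pad_list(lst, size, value):
        return lst + [value] * (size - len(lst))

    block_size, block_bucket_size, _PAD_BLOCK_ID = 4, 8, 0
    block_tables = [[0, 2], [1]]   # physical blocks owned by each sequence
    slot_mapping = [[0, 9], [6]]   # last slot id encodes last-block fill level

    block_list = list(itertools.chain(*block_tables))
    block_mapping: List[Union[None, int]] = [None] * block_bucket_size
    block_usage: List[Union[None, int]] = [None] * block_bucket_size
    for i, bt in enumerate(block_tables):
        for b in bt:
            if block_mapping[b] is None:
                block_mapping[b] = i
                block_usage[b] = block_size
    block_mapping = [b if b is not None else -1 for b in block_mapping]
    for bt, sl in zip(block_tables, slot_mapping):
        if bt:
            block_usage[bt[-1]] = sl[-1] % block_size + 1
    block_usage = [u if u is not None else 1 for u in block_usage]

    block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID)
    block_groups = pad_list(block_mapping, block_bucket_size, len(block_tables))
    block_list = torch.tensor(block_list, dtype=torch.int)

    # block_list:   [0, 2, 1, 0, 0, 0, 0, 0]      (padded with _PAD_BLOCK_ID)
    # block_groups: [0, 1, 0, -1, -1, -1, -1, -1] (block id -> sequence, -1 pad)
    # block_usage:  [4, 3, 2, 1, 1, 1, 1, 1]      (pad entries filled with 1)
    print(block_list, block_groups, block_usage)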