From 56ea91024d1c72a637905bc4e4186f9df3c28587 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Thu, 10 Oct 2024 14:53:26 +0300
Subject: [PATCH 1/8] Contiguous PA POC

---
 vllm/worker/hpu_model_runner.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 382a0abb21240..9ac68b438fdc4 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1018,12 +1018,22 @@ def _prepare_decode(
         block_mapping: List[int] = list(
             itertools.chain.from_iterable(block_mapping_nested))
 
-        last_block = [
-            sl % self.block_size + 1 for sl in itertools.chain(*slot_mapping)
-        ]
-        block_usage = [[self.block_size] * (b_u - 1) + [lb]
-                       for b_u, lb in zip(blocks_used, last_block)]
-        block_usage = list(itertools.chain(*block_usage))
+        max_idx = max(block_list)
+        max_blocks = max(max_idx + 1, len(block_list))
+        block_bucket_size = find_bucket(max_blocks, self.decode_block_bucket_cfg)
+
+        block_mapping = [None] * block_bucket_size
+        block_usage = [None] * block_bucket_size
+        for i, bt in enumerate(block_tables):
+            for b in bt:
+                if block_mapping[b] is None:
+                    block_mapping[b] = i
+                    block_usage[b] = self.block_size
+        block_mapping = [b if b is not None else 0 for b in block_mapping]
+
+        for bt, sl in zip(block_tables, slot_mapping):
+            block_usage[bt[-1]] = sl[-1] % self.block_size + 1
+        block_usage = [u if u is not None else 0 for u in block_usage]
 
         block_bucket_size = find_bucket(
             len(block_list),

From b03fb6e21874301dcb4b3abbf582178e6cd02b74 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Mon, 14 Oct 2024 17:27:52 +0300
Subject: [PATCH 2/8] Limit block_bucket_size

---
 vllm/worker/hpu_model_runner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 9ac68b438fdc4..0ab64499a1642 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1021,6 +1021,7 @@ def _prepare_decode(
         max_idx = max(block_list)
         max_blocks = max(max_idx + 1, len(block_list))
         block_bucket_size = find_bucket(max_blocks, self.decode_block_bucket_cfg)
+        block_bucket_size = min(block_bucket_size, self.cache_config.num_gpu_blocks)
 
         block_mapping = [None] * block_bucket_size
         block_usage = [None] * block_bucket_size
@@ -1027,12 +1028,13 @@ def _prepare_decode(
         for i, bt in enumerate(block_tables):
             for b in bt:
                 if block_mapping[b] is None:
                     block_mapping[b] = i
                     block_usage[b] = self.block_size
         block_mapping = [b if b is not None else 0 for b in block_mapping]
 
         for bt, sl in zip(block_tables, slot_mapping):
-            block_usage[bt[-1]] = sl[-1] % self.block_size + 1
+            if bt:
+                block_usage[bt[-1]] = sl[-1] % self.block_size + 1
         block_usage = [u if u is not None else 0 for u in block_usage]
 
         block_bucket_size = find_bucket(
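These first two patches replace the packed per-sequence decode metadata with a layout indexed by physical block id: block_mapping[b] names the sequence that owns KV-cache block b, block_usage[b] records how many tokens of that block are filled, and both lists are padded to a bucketed size that patch 2 caps at the number of available KV-cache blocks (while also guarding against empty block tables). Below is a minimal standalone sketch of the construction as it stands after patch 2; the inputs are hypothetical and find_bucket is a simplified stand-in for the fork's bucketing helper, not its real implementation.

    import itertools

    def round_up(n, step):
        return -(-n // step) * step

    def find_bucket(value, cfg):
        # Simplified stand-in: the fork's warmup buckets use an
        # exponential-then-linear schedule; here we only round up to the step.
        bmin, bstep, _bmax = cfg
        return max(bmin, round_up(value, bstep))

    def build_decode_metadata(block_tables, slot_mapping, block_size,
                              bucket_cfg, num_gpu_blocks):
        block_list = list(itertools.chain(*block_tables))
        max_blocks = max(max(block_list) + 1, len(block_list))
        bucket = min(find_bucket(max_blocks, bucket_cfg), num_gpu_blocks)

        # Metadata is indexed by *physical* block id: block b's entry lives
        # at offset b, not at a packed per-sequence offset.
        block_mapping = [None] * bucket
        block_usage = [None] * bucket
        for seq_idx, bt in enumerate(block_tables):
            for b in bt:
                if block_mapping[b] is None:
                    block_mapping[b] = seq_idx
                    block_usage[b] = block_size
        # Only the last block of each sequence is partially filled; its fill
        # level is recovered from the sequence's last slot id.
        for bt, sl in zip(block_tables, slot_mapping):
            if bt:
                block_usage[bt[-1]] = sl[-1] % block_size + 1
        block_mapping = [m if m is not None else 0 for m in block_mapping]
        block_usage = [u if u is not None else 0 for u in block_usage]
        return block_mapping, block_usage

    # Two sequences, block_size=4: seq 0 owns blocks [0, 2], seq 1 owns [1].
    print(build_decode_metadata([[0, 2], [1]], [[0, 9], [6]], 4,
                                (8, 8, 128), num_gpu_blocks=16))

Indexing by physical block id rather than by packed offset is what makes the paged-attention metadata "contiguous": entry b always describes block b, regardless of which sequence owns it.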
From a490556df90cac5355985ac6c28f2cf270e9e9b4 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Tue, 15 Oct 2024 12:23:15 +0300
Subject: [PATCH 3/8] Warmup for buckets with max_blocks

---
 vllm/worker/hpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 0ab64499a1642..7be9946c627f5 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -199,10 +199,11 @@ def generate_decode_buckets(bs_bucket_config, blocks_bucket_config,
     bs_buckets = warmup_range(bs_bucket_config)
     block_buckets = warmup_range(blocks_bucket_config)
     bmin, bstep, bmax = blocks_bucket_config
-    last_bucket = round_up(max_blocks, bstep)
+    last_bucket = max_blocks
    for bs in bs_buckets:
         for blocks in block_buckets:
             if blocks > last_bucket:
+                buckets.append((bs, last_bucket))
                 break
             buckets.append((bs, blocks))
     return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0])))

From 05b3d09742c2ca19b1ac6c4736d750a306844182 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Thu, 17 Oct 2024 10:49:04 +0300
Subject: [PATCH 4/8] block_mapping padding fix

---
 vllm/worker/hpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 7be9946c627f5..96b157f0d0f55 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1031,7 +1031,7 @@ def _prepare_decode(
                 if block_mapping[b] is None:
                     block_mapping[b] = i
                     block_usage[b] = self.block_size
-        block_mapping = [b if b is not None else 0 for b in block_mapping]
+        block_mapping = [b if b is not None else -1 for b in block_mapping]
 
         for bt, sl in zip(block_tables, slot_mapping):
             if bt:

From 3b028cc3a275a56d318064281197e914c00f29f9 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Wed, 23 Oct 2024 16:03:46 +0300
Subject: [PATCH 5/8] New softmax

---
 vllm/worker/hpu_model_runner.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 96b157f0d0f55..c8ece44da4680 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1003,25 +1003,18 @@ def _prepare_decode(
 
         num_decode_tokens = sum(seq_lens)
 
-        blocks_used = [len(bt) for bt in block_tables if bt]
-        block_list = []
+        block_list = list(itertools.chain(*block_tables))
+
         block_scales = []
         for i, bt in enumerate(block_tables):
-            block_list.extend(bt)
             blocks_in_group = len(bt)
             if blocks_in_group > 0:
                 scale = 1.0 / blocks_in_group
                 block_scales.extend([scale] * blocks_in_group)
 
-        block_mapping_nested: List[List[int]] = [
-            [i] * b_u for i, b_u in enumerate(blocks_used)
-        ]
-        block_mapping: List[int] = list(
-            itertools.chain.from_iterable(block_mapping_nested))
-
         max_idx = max(block_list)
         max_blocks = max(max_idx + 1, len(block_list))
-        block_bucket_size = find_bucket(max_blocks, self.decode_block_bucket_cfg)
+        block_bucket_size = find_bucket(max_blocks, self.bucketing_global_state.decode_block_bucket_cfg)
         block_bucket_size = min(block_bucket_size, self.cache_config.num_gpu_blocks)
 
         block_mapping = [None] * block_bucket_size
         block_usage = [None] * block_bucket_size
@@ -1028,18 +1021,15 @@ def _prepare_decode(
         for i, bt in enumerate(block_tables):
             for b in bt:
                 if block_mapping[b] is None:
                     block_mapping[b] = i
                     block_usage[b] = self.block_size
         block_mapping = [b if b is not None else -1 for b in block_mapping]
 
         for bt, sl in zip(block_tables, slot_mapping):
             if bt:
                 block_usage[bt[-1]] = sl[-1] % self.block_size + 1
         block_usage = [u if u is not None else 0 for u in block_usage]
 
-        block_bucket_size = find_bucket(
-            len(block_list),
-            self.bucketing_global_state.decode_block_bucket_cfg)
         block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID)
         block_groups = pad_list(block_mapping, block_bucket_size,
                                 len(block_tables))
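Patch 3 changes warmup so the block dimension of decode buckets is capped at the real maximum block count instead of being rounded up to the next bucket step: once a bucket would exceed max_blocks, a single capped (bs, max_blocks) bucket is emitted and the inner loop stops, so warmup compiles exactly the largest shape that can occur at runtime. Patch 4 then fixes the block_mapping pad value (-1 instead of 0, which is a valid sequence index), and patch 5 folds block_list construction into one itertools.chain pass for the new softmax path. A runnable sketch of the bucket generation follows; warmup_range here is a simplified linear stand-in for the fork's min/step/max range expansion.

    def warmup_range(cfg):
        bmin, bstep, bmax = cfg
        return list(range(bmin, bmax + 1, bstep))

    def generate_decode_buckets(bs_bucket_config, blocks_bucket_config,
                                max_blocks):
        buckets = []
        bs_buckets = warmup_range(bs_bucket_config)
        block_buckets = warmup_range(blocks_bucket_config)
        # Cap the block dimension at max_blocks rather than rounding it up to
        # the next step: the capped bucket is emitted once, then we stop.
        last_bucket = max_blocks
        for bs in bs_buckets:
            for blocks in block_buckets:
                if blocks > last_bucket:
                    buckets.append((bs, last_bucket))
                    break
                buckets.append((bs, blocks))
        return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0])))

    # With 20 usable blocks and block buckets 8/16/24, warmup covers (bs, 8),
    # (bs, 16) and the capped (bs, 20) instead of an unreachable (bs, 24).
    print(generate_decode_buckets((1, 1, 2), (8, 8, 24), max_blocks=20))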
From f3488dca248116895370cc16ff5b9bd0abbb2f03 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Thu, 24 Oct 2024 13:19:32 +0300
Subject: [PATCH 6/8] Softmax normalization adjustment

---
 requirements-hpu.txt            |  2 +-
 vllm/worker/hpu_model_runner.py | 30 ++++++++++++++----------------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index 7cefa4e631fa8..4719639da6188 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@c2801bb
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@6cb6e19

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index c8ece44da4680..3487fec0f1d5e 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1005,13 +1005,6 @@ def _prepare_decode(
 
         block_list = list(itertools.chain(*block_tables))
 
-        block_scales = []
-        for i, bt in enumerate(block_tables):
-            blocks_in_group = len(bt)
-            if blocks_in_group > 0:
-                scale = 1.0 / blocks_in_group
-                block_scales.extend([scale] * blocks_in_group)
-
         max_idx = max(block_list)
         max_blocks = max(max_idx + 1, len(block_list))
         block_bucket_size = find_bucket(max_blocks, self.bucketing_global_state.decode_block_bucket_cfg)
@@ -1019,25 +1012,30 @@ def _prepare_decode(
         block_mapping = [None] * block_bucket_size
         block_usage = [None] * block_bucket_size
+        block_scales = [None] * block_bucket_size
+
         for i, bt in enumerate(block_tables):
-            for b in bt:
-                if block_mapping[b] is None:
-                    block_mapping[b] = i
-                    block_usage[b] = self.block_size
+            if bt:
+                blocks_in_group = len(bt)
+                scale = 1.0 / blocks_in_group
+                for b in bt:
+                    if block_mapping[b] is None:
+                        block_mapping[b] = i
+                        block_usage[b] = self.block_size
+                        block_scales[b] = scale
+
         block_mapping = [b if b is not None else -1 for b in block_mapping]
+        block_scales = [b if b is not None else 0.0 for b in block_scales]
 
         for bt, sl in zip(block_tables, slot_mapping):
             if bt:
                 block_usage[bt[-1]] = sl[-1] % self.block_size + 1
-        block_usage = [u if u is not None else 0 for u in block_usage]
+        block_usage = [u if u is not None else 1 for u in block_usage]
 
         block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID)
         block_groups = pad_list(block_mapping, block_bucket_size,
                                 len(block_tables))
-        block_mapping = pad_list(block_mapping, block_bucket_size, -1)
-        block_usage = pad_list(block_usage, block_bucket_size, 1)
-        block_scales = pad_list(block_scales, block_bucket_size, 0.0)
-
+
         block_list = torch.tensor(block_list,
                                   dtype=torch.int,
                                   device=self.device)

From e8dfc9e26ca0457eee244fde23f9d63f65bc8e24 Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Fri, 25 Oct 2024 11:09:48 +0300
Subject: [PATCH 7/8] Formatting

---
 vllm/worker/hpu_model_runner.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 3487fec0f1d5e..2760f1e6a31fa 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1007,8 +1007,12 @@ def _prepare_decode(
 
         max_idx = max(block_list)
         max_blocks = max(max_idx + 1, len(block_list))
-        block_bucket_size = find_bucket(max_blocks, self.bucketing_global_state.decode_block_bucket_cfg)
-        block_bucket_size = min(block_bucket_size, self.cache_config.num_gpu_blocks)
+        block_bucket_size = find_bucket(
+            max_blocks, self.bucketing_global_state.decode_block_bucket_cfg
+        )
+        block_bucket_size = min(
+            block_bucket_size, self.cache_config.num_gpu_blocks
+        )
 
         block_mapping = [None] * block_bucket_size
         block_usage = [None] * block_bucket_size
@@ -1030,7 +1034,7 @@ def _prepare_decode(
         for bt, sl in zip(block_tables, slot_mapping):
             if bt:
                 block_usage[bt[-1]] = sl[-1] % self.block_size + 1
-        block_usage = [u if u is not None else 1 for u in block_usage] 
+        block_usage = [u if u is not None else 1 for u in block_usage]
 
         block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID)
         block_groups = pad_list(block_mapping, block_bucket_size,
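Patch 6 is the substantive half of the softmax normalization change: the per-sequence scale 1.0 / blocks_in_group moves into the same physical-block-indexed layout as block_mapping and block_usage (0.0 on unowned blocks, and usage's pad value becomes 1 rather than 0, presumably to keep padded entries numerically safe), while the requirements bump pins the vllm-hpu-extension commit (6cb6e19) that carries the matching kernel side. Patch 7 is formatting only. A standalone sketch of just the scale layout follows; the inputs are hypothetical, and how the scales enter the attention softmax lives in the extension and is not shown here.

    def build_block_scales(block_tables, block_bucket_size):
        block_scales = [None] * block_bucket_size
        for bt in block_tables:
            if bt:
                # Each block of a sequence carries 1/len(block_table), so the
                # per-block contributions of one sequence sum to 1.
                scale = 1.0 / len(bt)
                for b in bt:
                    if block_scales[b] is None:
                        block_scales[b] = scale
        return [s if s is not None else 0.0 for s in block_scales]

    # Seq 0 owns blocks [0, 2] (scale 0.5); seq 1 owns block [1] (scale 1.0);
    # unowned/padding blocks get 0.0 so they cannot contribute.
    print(build_block_scales([[0, 2], [1]], block_bucket_size=4))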
From e209acf2dbc9ec62d347e9e0ead1bbbfcf7d034d Mon Sep 17 00:00:00 2001
From: Marceli Fylcek
Date: Fri, 25 Oct 2024 12:04:47 +0300
Subject: [PATCH 8/8] Type annotations

---
 vllm/worker/hpu_model_runner.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 2760f1e6a31fa..4be0dc1a1abd8 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -203,7 +203,7 @@ def generate_decode_buckets(bs_bucket_config, blocks_bucket_config,
     for bs in bs_buckets:
         for blocks in block_buckets:
             if blocks > last_bucket:
-               buckets.append((bs, last_bucket))
+                buckets.append((bs, last_bucket))
                 break
             buckets.append((bs, blocks))
     return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0])))
@@ -1008,15 +1008,13 @@ def _prepare_decode(
         max_idx = max(block_list)
         max_blocks = max(max_idx + 1, len(block_list))
         block_bucket_size = find_bucket(
-            max_blocks, self.bucketing_global_state.decode_block_bucket_cfg
-        )
-        block_bucket_size = min(
-            block_bucket_size, self.cache_config.num_gpu_blocks
-        )
+            max_blocks, self.bucketing_global_state.decode_block_bucket_cfg)
+        block_bucket_size = min(block_bucket_size,
+                                self.cache_config.num_gpu_blocks)
 
-        block_mapping = [None] * block_bucket_size
-        block_usage = [None] * block_bucket_size
-        block_scales = [None] * block_bucket_size
+        block_mapping: List[Union[None, int]] = [None] * block_bucket_size
+        block_usage: List[Union[None, int]] = [None] * block_bucket_size
+        block_scales: List[Union[None, float]] = [None] * block_bucket_size
 
         for i, bt in enumerate(block_tables):
             if bt:
@@ -1039,7 +1037,6 @@ def _prepare_decode(
         block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID)
         block_groups = pad_list(block_mapping, block_bucket_size,
                                 len(block_tables))
-
         block_list = torch.tensor(block_list,
                                   dtype=torch.int,
                                   device=self.device)
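After patch 8's mypy-friendly annotations, the series leaves _prepare_decode emitting block-indexed, bucket-padded metadata end to end. A worked standalone example of the resulting values follows; the two-sequence state is hypothetical, pad_list is reimplemented locally, and _PAD_BLOCK_ID = 0 is assumed for illustration.

    import itertools
    from typing import List, Union

    import torch

    def pad_list(lst, size, value):
        return lst + [value] * (size - len(lst))

    block_size, block_bucket_size, _PAD_BLOCK_ID = 4, 8, 0
    block_tables = [[0, 2], [1]]   # physical blocks owned by each sequence
    slot_mapping = [[0, 9], [6]]   # last slot id encodes last-block fill level

    block_list = list(itertools.chain(*block_tables))
    block_mapping: List[Union[None, int]] = [None] * block_bucket_size
    block_usage: List[Union[None, int]] = [None] * block_bucket_size
    for i, bt in enumerate(block_tables):
        for b in bt:
            if block_mapping[b] is None:
                block_mapping[b] = i
                block_usage[b] = block_size
    block_mapping = [b if b is not None else -1 for b in block_mapping]
    for bt, sl in zip(block_tables, slot_mapping):
        if bt:
            block_usage[bt[-1]] = sl[-1] % block_size + 1
    block_usage = [u if u is not None else 1 for u in block_usage]

    block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID)
    block_groups = pad_list(block_mapping, block_bucket_size, len(block_tables))
    block_list = torch.tensor(block_list, dtype=torch.int)

    # block_list:   [0, 2, 1, 0, 0, 0, 0, 0]      (padded with _PAD_BLOCK_ID)
    # block_groups: [0, 1, 0, -1, -1, -1, -1, -1] (block id -> sequence, -1 pad)
    # block_usage:  [4, 3, 2, 1, 1, 1, 1, 1]      (pad entries filled with 1)
    print(block_list, block_groups, block_usage)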