Commit
hotfix attn alibi wo head mapping (#496)
Co-authored-by: oliveryuan <[email protected]>
Oliver-ss and oliveryuan authored Jul 18, 2023
1 parent 453bafb commit bda41c7
Showing 2 changed files with 3 additions and 0 deletions.
2 changes: 2 additions & 0 deletions tests/kernels/test_attention.py
@@ -199,6 +199,7 @@ def run_single_query_cached_kv_attention(
         ]
         block_tables.append(block_table)
     block_tables = torch.tensor(block_tables, dtype=torch.int, device='cuda')
+    head_mapping = torch.arange(num_heads, dtype=torch.int32, device="cuda")
 
     scale = float(1.0 / (head_size**0.5))
     output = torch.empty(num_tokens,
@@ -211,6 +212,7 @@ def run_single_query_cached_kv_attention(
         query,
         key_cache,
         value_cache,
+        head_mapping,
         scale,
         block_tables,
         context_lens,
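For reference: head_mapping tells the paged attention kernel which KV-cache head each query head should attend with. The test builds an identity mapping with torch.arange because its configuration has one KV head per query head. Below is a minimal sketch of the general construction; the make_head_mapping helper and the num_kv_heads parameter are illustrative assumptions and do not appear in this diff:

import torch

def make_head_mapping(num_heads: int, num_kv_heads: int) -> torch.Tensor:
    # Plain multi-head attention: identity mapping, as in the test above.
    # MQA/GQA: groups of consecutive query heads share one KV head.
    assert num_heads % num_kv_heads == 0
    return torch.repeat_interleave(
        torch.arange(num_kv_heads, dtype=torch.int32),
        num_heads // num_kv_heads,
    )

print(make_head_mapping(8, 8))  # tensor([0, 1, 2, 3, 4, 5, 6, 7], dtype=torch.int32)
print(make_head_mapping(8, 2))  # tensor([0, 0, 0, 0, 1, 1, 1, 1], dtype=torch.int32)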
1 change: 1 addition & 0 deletions vllm/model_executor/layers/attention.py
@@ -408,6 +408,7 @@ def single_query_cached_kv_attention(
             query,
             key_cache,
             value_cache,
+            self.head_mapping,
             self.scale,
             input_metadata.block_tables,
             input_metadata.context_lens,
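The kernel takes its arguments positionally, so the ALiBi code path that omitted self.head_mapping would have shifted every later argument by one slot; this hunk restores the expected ordering. On the layer side, self.head_mapping is a per-instance tensor set up once at construction. A hedged sketch of how such an attribute could be initialized, under an illustrative class and constructor signature (the actual __init__ in vllm/model_executor/layers/attention.py is not shown in this diff):

import torch

class PagedAttentionSketch(torch.nn.Module):
    # Illustrative stand-in for the patched layer, not vLLM's real class.
    def __init__(self, num_heads: int, num_kv_heads: int) -> None:
        super().__init__()
        # One KV-head index per query head; the kernel consumes this
        # alongside scale, block_tables, and context_lens (see diff above).
        self.head_mapping = torch.repeat_interleave(
            torch.arange(num_kv_heads, dtype=torch.int32),
            num_heads // num_kv_heads,
        )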
