Add DRAM pre-fetcher micro benchmark (#14850)
* #14512: Add DRAM pre-fetcher benchmark (#14528)

* #14512: Add second core coords for GS and BH

* #14512: Update BW target for GS

* #14512: Remove wrong assert
johanna-rock-tt authored Nov 7, 2024
1 parent 74c4dea commit 0d66408
Showing 3 changed files with 230 additions and 96 deletions.
tests/scripts/test_moreh_microbenchmark.py (74 changes: 63 additions & 11 deletions)
@@ -684,6 +684,7 @@ def test_matmul_single_core_sharded(
     [
         ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 0, 12, 0),
         ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 1, 12, 0),
+        ("wormhole_b0", 1000, np.array([2048, 3840]), 1, 4, 1, 12, 0),  # Padded FF1 shapes for llama 70b on TG
     ],
 )
 def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
@@ -722,29 +723,78 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form
 
 
 @pytest.mark.parametrize(
-    "arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id",
+    "arch, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id, bw_target",
     [
-        ("grayskull", 1202, np.array([32768 * 2, 8 * 128]), 1, 64, 1, 8, 0),
-        ("wormhole_b0", 1000, np.array([32768 * 2, 12 * 128]), 1, 64, 1, 12, 0),
-        ("blackhole", 800, np.array([32768 * 8, 8 * 128]), 1, 256, 1, 8, 0),
+        ("grayskull", np.array([32768 * 2, 8 * 128]), 1, 64, 1, 8, 0, None),
+        ("wormhole_b0", np.array([32768 * 2, 12 * 128]), 1, 64, 2, 12, 0, None),
+        ("blackhole", np.array([32768 * 8, 8 * 128]), 1, 256, 2, 8, 0, None),
+        # FF1/FF3 shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([2048, 3840]),
+            1,
+            16,
+            0,
+            12,
+            0,
+            240,
+        ),  # 244 GB/s
+        # FF2 shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([3584, 2304]),
+            1,
+            28,
+            1,
+            12,
+            0,
+            250,
+        ),  # 255 GB/s
+        # Dense Out shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([1024, 2304]),
+            1,
+            8,
+            1,
+            12,
+            0,
+            220,
+        ),  # 226 GB/s
+        # QKV shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([2048, 1536]),
+            1,
+            16,
+            1,
+            12,
+            0,
+            225,
+        ),  # 232 GB/s
     ],
 )
-def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
+def test_dram_read_l1_write_core(
+    arch, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id, bw_target
+):
+    dev_freq = get_device_freq()
     data = []
     cycle_list = []
     time_list = []
     throughput_list = []
     for _ in range(num_tests):
         k = int(test_vector[0])
         n = int(test_vector[1])
-        if data_format == 0:
+        if data_format == 0:  # BFP4
+            input_size = k * n * (512 + 64) // 1024
+        elif data_format == 1:  # BFP8
             input_size = k * n * 1088 // 1024
-        elif data_format == 1:
+        elif data_format == 2:  # BFLOAT16
             input_size = k * n * 2048 // 1024
         run_dram_read_l1_write_cmd(k, n, nblock, data_format, num_banks, bank_start_id)
         cycle = profile_results_kernel_duration()
-        time = cycle / freq / 1000.0 / 1000.0
-        throughput = input_size / cycle * freq / 1000.0
+        time = cycle / dev_freq / 1000.0 / 1000.0
+        throughput = input_size / cycle * dev_freq / 1000.0
         cycle_list.append(cycle)
         time_list.append(time)
         throughput_list.append(throughput)
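
Note on the input-size arithmetic above: the byte counts are per 1024 elements, i.e. per 32x32 tile. BFP4 packs 512 mantissa bytes plus 64 shared-exponent bytes per tile, BFP8 packs 1024 + 64 = 1088 bytes, and BFLOAT16 is a flat 2 bytes per element. A minimal sketch of the throughput computation for the FF2 case, using a hypothetical cycle count in place of the profiled kernel duration:

    # Sketch of the test's size/bandwidth arithmetic; the cycle count is
    # an illustrative stand-in for profile_results_kernel_duration().
    k, n = 3584, 2304              # FF2 shape for TG llama 70b
    bytes_per_1024_elems = 1088    # BFP8: 1024 mantissa + 64 exponent bytes per tile
    input_size = k * n * bytes_per_1024_elems // 1024   # 8,773,632 bytes
    dev_freq = 1000                # device clock in MHz
    cycle = 34_000                 # hypothetical kernel duration in cycles
    throughput = input_size / cycle * dev_freq / 1000.0  # bytes/cycle * MHz / 1000 = GB/s
    print(round(throughput))       # ~258 GB/s, consistent with the ~255 GB/s noted above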
@@ -756,13 +806,15 @@ def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, dat
         logger.info("DRAM read throughput: " + str(throughput))
         data.append([throughput])
     # check within range
-    dev_freq = get_device_freq()
     if arch == "grayskull":
-        bw_bound = 100.0
+        bw_bound = 70.0  # Equals 84 GB/s at 1200 MHz
     elif arch == "wormhole_b0":
         bw_bound = 260.0
     elif arch == "blackhole":
         bw_bound = 340.0
+    if bw_target is not None:
+        bw_bound = bw_target
+    bw_bound = bw_bound * dev_freq / 1000.0  # Adjust for device frequency; targets are defined at the max device frequency
     assert bw_bound <= throughput
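
Note on the bandwidth check above: a per-shape bw_target, when given, overrides the per-architecture default, and the bound is then scaled by the measured device clock because the GB/s targets are defined at the maximum device frequency. A small sketch with an assumed 900 MHz reading (both numbers are illustrative):

    # Frequency-scaled bandwidth bound; values are illustrative only.
    bw_target = 240.0                          # GB/s target at the nominal 1000 MHz clock
    dev_freq = 900                             # hypothetical get_device_freq() reading, in MHz
    bw_bound = bw_target * dev_freq / 1000.0   # 216.0 GB/s required at 900 MHz
    measured_throughput = 220.0                # hypothetical measurement, in GB/s
    assert bw_bound <= measured_throughput     # passes: 216.0 <= 220.0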


(second changed file; file name not shown in this view)

@@ -11,40 +11,55 @@

 void kernel_main() {
     constexpr uint32_t num_blocks = get_compile_time_arg_val(0);
-    constexpr uint32_t num_pages = get_compile_time_arg_val(1);
-    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(2);
-    constexpr uint32_t page_size = get_compile_time_arg_val(3);
-    constexpr uint32_t noc = get_compile_time_arg_val(4);
+    constexpr uint32_t num_pages_w_per_receiver = get_compile_time_arg_val(1);
+    constexpr uint32_t num_tiles_h = get_compile_time_arg_val(2);
+    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(3);
+    constexpr uint32_t page_size = get_compile_time_arg_val(4);
+    constexpr uint32_t noc = get_compile_time_arg_val(5);
 
     const uint32_t vc = get_arg_val<uint32_t>(0);
-    const uint32_t noc_x = get_arg_val<uint32_t>(1);
-    const uint32_t noc_y = get_arg_val<uint32_t>(2);
+    // First L1 writer core coordinates
+    const uint32_t noc_x1 = get_arg_val<uint32_t>(1);
+    const uint32_t noc_y1 = get_arg_val<uint32_t>(2);
+    // Second L1 writer core coordinates
+    const uint32_t noc_x2 = get_arg_val<uint32_t>(3);
+    const uint32_t noc_y2 = get_arg_val<uint32_t>(4);
 
     constexpr uint32_t cb_id = 0;
 
     uint32_t l1_write_addr = get_write_ptr(cb_id);
-    const uint64_t l1_noc_write_addr = get_noc_addr(noc_x, noc_y, l1_write_addr, noc);
+    const uint64_t l1_noc_write_addr1 = get_noc_addr(noc_x1, noc_y1, l1_write_addr, noc);
+    const uint64_t l1_noc_write_addr2 = get_noc_addr(noc_x2, noc_y2, l1_write_addr, noc);
 
-    noc_async_write_one_packet_set_state(l1_noc_write_addr, page_size, noc, vc);
-
-    for (uint32_t block = 0; block < num_blocks; ++block) {
-
-        auto remote_l1_write_addr = l1_noc_write_addr;
+    for (uint32_t block = 0; block < num_blocks; ++block) {  // Iterate over blocks
 
         cb_wait_front(cb_id, block_num_tiles);
-        auto l1_read_addr = get_read_ptr(cb_id);
 
-        for (uint32_t h = 0; h < num_pages; ++h) {
-            noc_async_write_one_packet_with_state(l1_read_addr, remote_l1_write_addr, noc);
-            l1_read_addr += page_size;
-            remote_l1_write_addr += page_size;
+        for (uint32_t core_id = 0; core_id < 2; ++core_id) {  // Iterate over two neighboring cores
+            uint64_t l1_noc_write_addr_for_receiver_core = 0;
+            uint32_t l1_read_addr = get_read_ptr(cb_id);
+            if (core_id == 0) {
+                l1_noc_write_addr_for_receiver_core = l1_noc_write_addr1;  // Set write pointer to start of cb for first core
+            } else {
+                l1_noc_write_addr_for_receiver_core = l1_noc_write_addr2;  // Set write pointer to start of cb for second core
+                l1_read_addr += page_size * num_pages_w_per_receiver;  // Stride read pointer to start of second core's data
+            }
+
+            noc_async_write_one_packet_set_state(l1_noc_write_addr_for_receiver_core, page_size, noc, vc);  // Set state to write a page to noc/vc
+
+            for (uint32_t h = 0; h < num_tiles_h; ++h) {  // Iterate over page rows per receiver core
+                for (uint32_t w = 0; w < num_pages_w_per_receiver; ++w) {  // Iterate over page columns per receiver core
+                    noc_async_write_one_packet_with_state(l1_read_addr, l1_noc_write_addr_for_receiver_core, noc);
+                    l1_read_addr += page_size;
+                    l1_noc_write_addr_for_receiver_core += page_size;
+                }
+                l1_read_addr += page_size * num_pages_w_per_receiver;  // Stride read pointer over other core's data
+            }
         }
 
         noc_async_write_barrier(noc);
 
         cb_pop_front(cb_id, block_num_tiles);
-
     }
-
 
 }
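
Note on the kernel change above: each block in the circular buffer now holds num_tiles_h rows of 2 * num_pages_w_per_receiver pages, and the reader fans a block out to two L1 receiver cores; the first core receives the left half of every row and the second core the right half, each written contiguously into its L1. A small host-side model of the read-pointer striding (the parameter values are illustrative):

    # Model of which source pages of a block each receiver core is sent.
    num_tiles_h = 2                  # illustrative
    num_pages_w_per_receiver = 3     # illustrative
    row_stride = 2 * num_pages_w_per_receiver  # pages per row in the block

    for core_id in range(2):
        read_page = core_id * num_pages_w_per_receiver  # second core starts half a row in
        for h in range(num_tiles_h):
            pages = list(range(read_page, read_page + num_pages_w_per_receiver))
            print(f"core {core_id}, row {h}: pages {pages}")
            read_page += row_stride  # skip over the other core's half of the row

    # Output:
    #   core 0, row 0: pages [0, 1, 2]
    #   core 0, row 1: pages [6, 7, 8]
    #   core 1, row 0: pages [3, 4, 5]
    #   core 1, row 1: pages [9, 10, 11]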
(diff for the third changed file was not loaded in this view)
