Add DRAM pre-fetcher micro benchmark (#14850)
* #14512: Add DRAM pre-fetcher benchmark (#14528)

* #14512: Add second core coords for GS and BH

* #14512: Update BW target for GS

* #14512: Remove wrong assert
johanna-rock-tt authored Nov 7, 2024
1 parent 74c4dea commit 0d66408
Showing 3 changed files with 230 additions and 96 deletions.
tests/scripts/test_moreh_microbenchmark.py (74 changes: 63 additions & 11 deletions)
@@ -684,6 +684,7 @@ def test_matmul_single_core_sharded(
     [
         ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 0, 12, 0),
         ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 1, 12, 0),
+        ("wormhole_b0", 1000, np.array([2048, 3840]), 1, 4, 1, 12, 0),  # Padded FF1 shapes for llama 70b on TG
     ],
 )
 def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
@@ -722,29 +723,78 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form
 
 
 @pytest.mark.parametrize(
-    "arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id",
+    "arch, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id, bw_target",
     [
-        ("grayskull", 1202, np.array([32768 * 2, 8 * 128]), 1, 64, 1, 8, 0),
-        ("wormhole_b0", 1000, np.array([32768 * 2, 12 * 128]), 1, 64, 1, 12, 0),
-        ("blackhole", 800, np.array([32768 * 8, 8 * 128]), 1, 256, 1, 8, 0),
+        ("grayskull", np.array([32768 * 2, 8 * 128]), 1, 64, 1, 8, 0, None),
+        ("wormhole_b0", np.array([32768 * 2, 12 * 128]), 1, 64, 2, 12, 0, None),
+        ("blackhole", np.array([32768 * 8, 8 * 128]), 1, 256, 2, 8, 0, None),
+        # FF1/FF3 shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([2048, 3840]),
+            1,
+            16,
+            0,
+            12,
+            0,
+            240,
+        ),  # 244 GB/s
+        # FF2 shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([3584, 2304]),
+            1,
+            28,
+            1,
+            12,
+            0,
+            250,
+        ),  # 255 GB/s
+        # Dense Out shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([1024, 2304]),
+            1,
+            8,
+            1,
+            12,
+            0,
+            220,
+        ),  # 226 GB/s
+        # QKV shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([2048, 1536]),
+            1,
+            16,
+            1,
+            12,
+            0,
+            225,
+        ),  # 232 GB/s
     ],
 )
-def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
+def test_dram_read_l1_write_core(
+    arch, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id, bw_target
+):
+    dev_freq = get_device_freq()
     data = []
     cycle_list = []
     time_list = []
     throughput_list = []
     for _ in range(num_tests):
         k = int(test_vector[0])
         n = int(test_vector[1])
-        if data_format == 0:
+        if data_format == 0:  # BFP4
+            input_size = k * n * (512 + 64) // 1024
+        elif data_format == 1:  # BFP8
             input_size = k * n * 1088 // 1024
-        elif data_format == 1:
+        elif data_format == 2:  # BFLOAT16
             input_size = k * n * 2048 // 1024
         run_dram_read_l1_write_cmd(k, n, nblock, data_format, num_banks, bank_start_id)
         cycle = profile_results_kernel_duration()
-        time = cycle / freq / 1000.0 / 1000.0
-        throughput = input_size / cycle * freq / 1000.0
+        time = cycle / dev_freq / 1000.0 / 1000.0
+        throughput = input_size / cycle * dev_freq / 1000.0
         cycle_list.append(cycle)
         time_list.append(time)
         throughput_list.append(throughput)
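
Note on the input-size arithmetic above: the byte counts are per 1024 elements, i.e. per 32x32 tile. BFP4 packs 512 mantissa bytes plus 64 shared-exponent bytes per tile, BFP8 packs 1024 + 64 = 1088 bytes, and BFLOAT16 is a flat 2 bytes per element. A minimal sketch of the throughput computation for the FF2 case, using a hypothetical cycle count in place of the profiled kernel duration:

    # Sketch of the test's size/bandwidth arithmetic; the cycle count is
    # an illustrative stand-in for profile_results_kernel_duration().
    k, n = 3584, 2304              # FF2 shape for TG llama 70b
    bytes_per_1024_elems = 1088    # BFP8: 1024 mantissa + 64 exponent bytes per tile
    input_size = k * n * bytes_per_1024_elems // 1024   # 8,773,632 bytes
    dev_freq = 1000                # device clock in MHz
    cycle = 34_000                 # hypothetical kernel duration in cycles
    throughput = input_size / cycle * dev_freq / 1000.0  # bytes/cycle * MHz / 1000 = GB/s
    print(round(throughput))       # ~258 GB/s, consistent with the ~255 GB/s noted above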
@@ -756,13 +806,15 @@ def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, dat
         logger.info("DRAM read throughput: " + str(throughput))
         data.append([throughput])
     # check within range
-    dev_freq = get_device_freq()
     if arch == "grayskull":
-        bw_bound = 100.0
+        bw_bound = 70.0  # Equals 84 GB/s at 1200 MHz
     elif arch == "wormhole_b0":
         bw_bound = 260.0
     elif arch == "blackhole":
         bw_bound = 340.0
+    if bw_target is not None:
+        bw_bound = bw_target
+    bw_bound = bw_bound * dev_freq / 1000.0  # Adjust for device frequency; targets are defined at the max device frequency
     assert bw_bound <= throughput
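
Note on the bandwidth check above: a per-shape bw_target, when given, overrides the per-architecture default, and the bound is then scaled by the measured device clock because the GB/s targets are defined at the maximum device frequency. A small sketch with an assumed 900 MHz reading (both numbers are illustrative):

    # Frequency-scaled bandwidth bound; values are illustrative only.
    bw_target = 240.0                          # GB/s target at the nominal 1000 MHz clock
    dev_freq = 900                             # hypothetical get_device_freq() reading, in MHz
    bw_bound = bw_target * dev_freq / 1000.0   # 216.0 GB/s required at 900 MHz
    measured_throughput = 220.0                # hypothetical measurement, in GB/s
    assert bw_bound <= measured_throughput     # passes: 216.0 <= 220.0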


(second changed file; file name not shown in this view)

@@ -11,40 +11,55 @@

 void kernel_main() {
     constexpr uint32_t num_blocks = get_compile_time_arg_val(0);
-    constexpr uint32_t num_pages = get_compile_time_arg_val(1);
-    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(2);
-    constexpr uint32_t page_size = get_compile_time_arg_val(3);
-    constexpr uint32_t noc = get_compile_time_arg_val(4);
+    constexpr uint32_t num_pages_w_per_receiver = get_compile_time_arg_val(1);
+    constexpr uint32_t num_tiles_h = get_compile_time_arg_val(2);
+    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(3);
+    constexpr uint32_t page_size = get_compile_time_arg_val(4);
+    constexpr uint32_t noc = get_compile_time_arg_val(5);
 
     const uint32_t vc = get_arg_val<uint32_t>(0);
-    const uint32_t noc_x = get_arg_val<uint32_t>(1);
-    const uint32_t noc_y = get_arg_val<uint32_t>(2);
+    // First L1 writer core coordinates
+    const uint32_t noc_x1 = get_arg_val<uint32_t>(1);
+    const uint32_t noc_y1 = get_arg_val<uint32_t>(2);
+    // Second L1 writer core coordinates
+    const uint32_t noc_x2 = get_arg_val<uint32_t>(3);
+    const uint32_t noc_y2 = get_arg_val<uint32_t>(4);
 
     constexpr uint32_t cb_id = 0;
 
     uint32_t l1_write_addr = get_write_ptr(cb_id);
-    const uint64_t l1_noc_write_addr = get_noc_addr(noc_x, noc_y, l1_write_addr, noc);
+    const uint64_t l1_noc_write_addr1 = get_noc_addr(noc_x1, noc_y1, l1_write_addr, noc);
+    const uint64_t l1_noc_write_addr2 = get_noc_addr(noc_x2, noc_y2, l1_write_addr, noc);
 
-    noc_async_write_one_packet_set_state(l1_noc_write_addr, page_size, noc, vc);
-
-    for (uint32_t block = 0; block < num_blocks; ++block) {
-
-        auto remote_l1_write_addr = l1_noc_write_addr;
+    for (uint32_t block = 0; block < num_blocks; ++block) {  // Iterate over blocks
 
         cb_wait_front(cb_id, block_num_tiles);
-        auto l1_read_addr = get_read_ptr(cb_id);
 
-        for (uint32_t h = 0; h < num_pages; ++h) {
-            noc_async_write_one_packet_with_state(l1_read_addr, remote_l1_write_addr, noc);
-            l1_read_addr += page_size;
-            remote_l1_write_addr += page_size;
+        for (uint32_t core_id = 0; core_id < 2; ++core_id) {  // Iterate over two neighboring cores
+            uint64_t l1_noc_write_addr_for_receiver_core = 0;
+            uint32_t l1_read_addr = get_read_ptr(cb_id);
+            if (core_id == 0) {
+                l1_noc_write_addr_for_receiver_core = l1_noc_write_addr1;  // Set write pointer to start of cb for first core
+            } else {
+                l1_noc_write_addr_for_receiver_core = l1_noc_write_addr2;  // Set write pointer to start of cb for second core
+                l1_read_addr += page_size * num_pages_w_per_receiver;  // Stride read pointer to start of second core's data
+            }
+
+            noc_async_write_one_packet_set_state(l1_noc_write_addr_for_receiver_core, page_size, noc, vc);  // Set state to write a page to noc/vc
+
+            for (uint32_t h = 0; h < num_tiles_h; ++h) {  // Iterate over page rows per receiver core
+                for (uint32_t w = 0; w < num_pages_w_per_receiver; ++w) {  // Iterate over page columns per receiver core
+                    noc_async_write_one_packet_with_state(l1_read_addr, l1_noc_write_addr_for_receiver_core, noc);
+                    l1_read_addr += page_size;
+                    l1_noc_write_addr_for_receiver_core += page_size;
+                }
+                l1_read_addr += page_size * num_pages_w_per_receiver;  // Stride read pointer over other core's data
+            }
         }
 
         noc_async_write_barrier(noc);
 
         cb_pop_front(cb_id, block_num_tiles);
-
     }
-
 
 }
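
Note on the kernel change above: each block in the circular buffer now holds num_tiles_h rows of 2 * num_pages_w_per_receiver pages, and the reader fans a block out to two L1 receiver cores; the first core receives the left half of every row and the second core the right half, each written contiguously into its L1. A small host-side model of the read-pointer striding (the parameter values are illustrative):

    # Model of which source pages of a block each receiver core is sent.
    num_tiles_h = 2                  # illustrative
    num_pages_w_per_receiver = 3     # illustrative
    row_stride = 2 * num_pages_w_per_receiver  # pages per row in the block

    for core_id in range(2):
        read_page = core_id * num_pages_w_per_receiver  # second core starts half a row in
        for h in range(num_tiles_h):
            pages = list(range(read_page, read_page + num_pages_w_per_receiver))
            print(f"core {core_id}, row {h}: pages {pages}")
            read_page += row_stride  # skip over the other core's half of the row

    # Output:
    #   core 0, row 0: pages [0, 1, 2]
    #   core 0, row 1: pages [6, 7, 8]
    #   core 1, row 0: pages [3, 4, 5]
    #   core 1, row 1: pages [9, 10, 11]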
(diff for the third changed file was not loaded in this view)
