diff --git a/tests/scripts/test_moreh_microbenchmark.py b/tests/scripts/test_moreh_microbenchmark.py
index c93b82c45d6..1c418bc6e0b 100755
--- a/tests/scripts/test_moreh_microbenchmark.py
+++ b/tests/scripts/test_moreh_microbenchmark.py
@@ -684,6 +684,7 @@ def test_matmul_single_core_sharded(
     [
         ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 0, 12, 0),
         ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 1, 12, 0),
+        ("wormhole_b0", 1000, np.array([2048, 3840]), 1, 4, 1, 12, 0),  # Padded FF1 shapes for llama 70b on TG
     ],
 )
 def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
@@ -722,14 +723,61 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form
 @pytest.mark.parametrize(
-    "arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id",
+    "arch, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id, bw_target",
     [
-        ("grayskull", 1202, np.array([32768 * 2, 8 * 128]), 1, 64, 1, 8, 0),
-        ("wormhole_b0", 1000, np.array([32768 * 2, 12 * 128]), 1, 64, 1, 12, 0),
-        ("blackhole", 800, np.array([32768 * 8, 8 * 128]), 1, 256, 1, 8, 0),
+        ("grayskull", np.array([32768 * 2, 8 * 128]), 1, 64, 1, 8, 0, None),
+        ("wormhole_b0", np.array([32768 * 2, 12 * 128]), 1, 64, 2, 12, 0, None),
+        ("blackhole", np.array([32768 * 8, 8 * 128]), 1, 256, 2, 8, 0, None),
+        # FF1/FF3 shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([2048, 3840]),
+            1,
+            16,
+            0,
+            12,
+            0,
+            240,
+        ),  # 244 GB/s
+        # FF2 shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([3584, 2304]),
+            1,
+            28,
+            1,
+            12,
+            0,
+            250,
+        ),  # 255 GB/s
+        # Dense Out shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([1024, 2304]),
+            1,
+            8,
+            1,
+            12,
+            0,
+            220,
+        ),  # 226 GB/s
+        # QKV shapes for TG llama 70b
+        (
+            "wormhole_b0",
+            np.array([2048, 1536]),
+            1,
+            16,
+            1,
+            12,
+            0,
+            225,
+        ),  # 232 GB/s
     ],
 )
-def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
+def test_dram_read_l1_write_core(
+    arch, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id, bw_target
+):
+    dev_freq = get_device_freq()
     data = []
     cycle_list = []
     time_list = []
@@ -737,14 +785,16 @@ def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, dat
     for _ in range(num_tests):
         k = int(test_vector[0])
         n = int(test_vector[1])
-        if data_format == 0:
+        if data_format == 0:  # BFP4
+            input_size = k * n * (512 + 64) // 1024
+        elif data_format == 1:  # BFP8
             input_size = k * n * 1088 // 1024
-        elif data_format == 1:
+        elif data_format == 2:  # BFLOAT16
             input_size = k * n * 2048 // 1024
         run_dram_read_l1_write_cmd(k, n, nblock, data_format, num_banks, bank_start_id)
         cycle = profile_results_kernel_duration()
-        time = cycle / freq / 1000.0 / 1000.0
-        throughput = input_size / cycle * freq / 1000.0
+        time = cycle / dev_freq / 1000.0 / 1000.0
+        throughput = input_size / cycle * dev_freq / 1000.0
         cycle_list.append(cycle)
         time_list.append(time)
         throughput_list.append(throughput)
@@ -756,13 +806,15 @@ def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, dat
     logger.info("DRAM read throughput: " + str(throughput))
     data.append([throughput])
     # check within range
-    dev_freq = get_device_freq()
     if arch == "grayskull":
-        bw_bound = 100.0
+        bw_bound = 70.0  # Equals 85 GB/s with 1200 MHz
     elif arch == "wormhole_b0":
         bw_bound = 260.0
     elif arch == "blackhole":
         bw_bound = 340.0
+    if bw_target is not None:
+        bw_bound = bw_target
+    bw_bound = bw_bound * dev_freq / 1000.0  # Adjust for device frequency; target is based on max device frequency
     assert bw_bound <= throughput
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/writer_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/writer_l1.cpp
index 3184c98f187..e48499e4883 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/writer_l1.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/writer_l1.cpp
@@ -11,40 +11,55 @@ void kernel_main() {
     constexpr uint32_t num_blocks = get_compile_time_arg_val(0);
-    constexpr uint32_t num_pages = get_compile_time_arg_val(1);
-    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(2);
-    constexpr uint32_t page_size = get_compile_time_arg_val(3);
-    constexpr uint32_t noc = get_compile_time_arg_val(4);
+    constexpr uint32_t num_pages_w_per_receiver = get_compile_time_arg_val(1);
+    constexpr uint32_t num_tiles_h = get_compile_time_arg_val(2);
+    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(3);
+    constexpr uint32_t page_size = get_compile_time_arg_val(4);
+    constexpr uint32_t noc = get_compile_time_arg_val(5);

     const uint32_t vc = get_arg_val<uint32_t>(0);
-    const uint32_t noc_x = get_arg_val<uint32_t>(1);
-    const uint32_t noc_y = get_arg_val<uint32_t>(2);
+    // First L1 writer core coordinates
+    const uint32_t noc_x1 = get_arg_val<uint32_t>(1);
+    const uint32_t noc_y1 = get_arg_val<uint32_t>(2);
+    // Second L1 writer core coordinates
+    const uint32_t noc_x2 = get_arg_val<uint32_t>(3);
+    const uint32_t noc_y2 = get_arg_val<uint32_t>(4);

     constexpr uint32_t cb_id = 0;

     uint32_t l1_write_addr = get_write_ptr(cb_id);
-    const uint64_t l1_noc_write_addr = get_noc_addr(noc_x, noc_y, l1_write_addr, noc);
+    const uint64_t l1_noc_write_addr1 = get_noc_addr(noc_x1, noc_y1, l1_write_addr, noc);
+    const uint64_t l1_noc_write_addr2 = get_noc_addr(noc_x2, noc_y2, l1_write_addr, noc);

-    noc_async_write_one_packet_set_state(l1_noc_write_addr, page_size, noc, vc);
-
-    for (uint32_t block = 0; block < num_blocks; ++block) {
-
-        auto remote_l1_write_addr = l1_noc_write_addr;
+    for (uint32_t block = 0; block < num_blocks; ++block) {  // Iterate over blocks

         cb_wait_front(cb_id, block_num_tiles);
-        auto l1_read_addr = get_read_ptr(cb_id);

-        for (uint32_t h = 0; h < num_pages; ++h) {
-            noc_async_write_one_packet_with_state(l1_read_addr, remote_l1_write_addr, noc);
-            l1_read_addr += page_size;
-            remote_l1_write_addr += page_size;
+        for (uint32_t core_id = 0; core_id < 2; ++core_id) {  // Iterate over the two neighboring receiver cores
+            uint64_t l1_noc_write_addr_for_receiver_core = 0;
+            uint32_t l1_read_addr = get_read_ptr(cb_id);
+            if (core_id == 0) {
+                l1_noc_write_addr_for_receiver_core = l1_noc_write_addr1;  // Set write pointer to start of cb for first core
+            } else {
+                l1_noc_write_addr_for_receiver_core = l1_noc_write_addr2;  // Set write pointer to start of cb for second core
+                l1_read_addr += page_size * num_pages_w_per_receiver;  // Stride read pointer to start of second core's data
+            }
+
+            noc_async_write_one_packet_set_state(l1_noc_write_addr_for_receiver_core, page_size, noc, vc);  // Set state to write a page to noc/vc
+
+            for (uint32_t h = 0; h < num_tiles_h; ++h) {  // Iterate over page rows per receiver core
+                for (uint32_t w = 0; w < num_pages_w_per_receiver; ++w) {  // Iterate over page columns per receiver core
+                    noc_async_write_one_packet_with_state(l1_read_addr, l1_noc_write_addr_for_receiver_core, noc);
+                    l1_read_addr += page_size;
+                    l1_noc_write_addr_for_receiver_core += page_size;
+                }
+                l1_read_addr += page_size * num_pages_w_per_receiver;  // Stride read pointer over the other core's data
+            }
         }

         noc_async_write_barrier(noc);

         cb_pop_front(cb_id, block_num_tiles);
-
     }
-
 }
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
index 394e57372c6..2b05307cd22 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
@@ -10,7 +10,9 @@
 #include
 #include
 #include
+#include

+#include "common/bfloat4.hpp"
 #include "common/bfloat8.hpp"
 #include "common/bfloat16.hpp"
 #include "common/tt_backend_api_types.hpp"
@@ -63,14 +65,18 @@ std::vector slice_vec(std::vector const &v, int m, int n) {
     return vec;
 }

-void get_max_page_size_and_num_pages(uint32_t num_tiles, uint32_t tile_size, uint32_t& page_size, uint32_t& num_pages) {
-    uint64_t total_size = static_cast<uint64_t>(num_tiles) * tile_size;
+void get_max_page_size_and_num_pages(uint32_t num_tiles_w, uint32_t num_tiles_h, uint32_t tile_size, uint32_t& page_size, uint32_t& num_pages, uint32_t& num_pages_w_per_receiver) {
+    uint64_t half_row_bytes = static_cast<uint64_t>(num_tiles_w / 2) * tile_size;
+    TT_ASSERT(num_tiles_w % 2 == 0, "num_tiles_w {} must be divisible by 2", num_tiles_w);

     page_size = (8192 / tile_size) * tile_size;
-    while (total_size % page_size != 0 && page_size >= tile_size) {
+    // Each receiver core receives half the data, so each receiver core's block size is half of the total block size
+    while (half_row_bytes % page_size != 0 && page_size > tile_size) {
         page_size -= tile_size;
     }
-    num_pages = total_size / page_size;
+    TT_ASSERT(page_size % tile_size == 0, "page_size must be a multiple of tile_size!");
+    num_pages = num_tiles_w * num_tiles_h * tile_size / page_size;
+    num_pages_w_per_receiver = half_row_bytes / page_size;
 }

 std::tuple create_program(
@@ -101,8 +107,11 @@ std::tuple create_program(
     // DRAM reader CB
     uint32_t reader_cb_index = 0;
    uint32_t reader_cb_size = block_h * block_w * single_tile_size * 3;
-    uint32_t page_size, num_pages;
-    get_max_page_size_and_num_pages(block_num_tiles, single_tile_size, page_size, num_pages);
+    uint32_t page_size, num_pages, num_pages_w_per_receiver;
+    get_max_page_size_and_num_pages(block_w, block_h, single_tile_size, page_size, num_pages, num_pages_w_per_receiver);
+
+    log_info("Input block size: {}x{}, num_blocks: {}", block_h, block_w, num_blocks);
+    log_info("Pages set up as page_size: {}, num_pages: {}, num_pages_w_per_receiver: {}", page_size, num_pages, num_pages_w_per_receiver);

     uint32_t reader_cb_addr = device->get_base_allocator_addr(HalMemType::L1);
     tt_metal::CircularBufferConfig reader_cb_config =
@@ -132,7 +141,8 @@ std::tuple create_program(
     std::vector writer_compile_time_args = {
         (std::uint32_t) num_blocks,
-        (std::uint32_t) num_pages,
+        (std::uint32_t) num_pages_w_per_receiver,
+        (std::uint32_t) block_h,
         (std::uint32_t) block_num_tiles,
         (std::uint32_t) page_size,
         (std::uint32_t) tt_metal::NOC::RISCV_0_default
@@ -174,13 +184,19 @@ std::tuple create_program(
         tt_metal::SetRuntimeArgs(program, reader_kernel, core, reader_rt_args);

-        auto writer_core = all_l1_writer_cores_ordered[i];
-        auto writer_core_phy = device->worker_core_from_logical_core(writer_core);
+        auto writer_core1 = all_l1_writer_cores_ordered[i*2];
+        auto writer_core_phy1 = device->worker_core_from_logical_core(writer_core1);
+        auto writer_core2 = all_l1_writer_cores_ordered[(i*2)+1];
+        auto writer_core_phy2 = device->worker_core_from_logical_core(writer_core2);

         const std::array writer_rt_args = {
             (std::uint32_t) (vc + 2) & 0x3,
-            (std::uint32_t) writer_core_phy.x,
-            (std::uint32_t) writer_core_phy.y
+            // First L1 receiver core coordinates
+            (std::uint32_t) writer_core_phy1.x,
+            (std::uint32_t) writer_core_phy1.y,
+            // Second L1 receiver core coordinates
+            (std::uint32_t) writer_core_phy2.x,
+            (std::uint32_t) writer_core_phy2.y
         };

         tt_metal::SetRuntimeArgs(program, writer_kernel, core, writer_rt_args);
@@ -188,68 +204,102 @@ std::tuple create_program(
     return {std::move(program), reader_kernel, reader_cb_addr};
 }

+template <typename T>
+bool validate_data(
+    const std::vector<T>& result_data,
+    const std::vector<T>& input_data,
+    uint32_t block_h,
+    uint32_t block_w_per_receiver,
+    uint32_t block_w,
+    uint32_t datums_per_tile,
+    uint32_t num_banks,
+    uint32_t input_start_index_for_core)
+{
+    for (uint32_t r = 0; r < block_h; ++r) {
+        for (uint32_t c = 0; c < block_w_per_receiver; ++c) {
+            uint32_t one_row_bytes = block_w * datums_per_tile * num_banks;
+            uint32_t input_step = input_start_index_for_core + r * one_row_bytes + c * datums_per_tile * num_banks;
+            auto input_begin = input_data.begin() + input_step;
+            auto input_end = input_begin + datums_per_tile;
+            std::vector<T> input_slice(input_begin, input_end);
+
+            uint32_t result_step = r * (datums_per_tile * block_w_per_receiver) + c * datums_per_tile;
+            auto result_begin = result_data.begin() + result_step;
+            auto result_end = result_begin + datums_per_tile;
+            std::vector<T> result_slice(result_begin, result_end);
+
+            if (input_slice != result_slice) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
 bool validation(
     tt_metal::Device *device,
     tt_metal::Buffer &input_buffer,
     std::vector &input_vec,
-    const uint32_t &num_cores,
+    uint32_t num_cores,
     std::vector &all_cores,
-    const uint32_t &num_tiles_per_core,
-    const uint32_t &cb_addr,
-    const uint32_t &single_tile_size,
+    uint32_t num_tiles_per_core,
+    uint32_t cb_addr,
+    uint32_t single_tile_size,
     uint32_t num_tiles_cb,
     uint32_t df,
     uint32_t num_banks,
     uint32_t num_blocks,
-    uint32_t block_h,
-    uint32_t block_w,
-    uint32_t num_datum_per_slice) {
+    uint32_t block_h, // block_h per core
+    uint32_t block_w, // block_w per core
+    uint32_t block_w_per_receiver,
+    uint32_t datums_per_tile) { // 32x32

     uint32_t core_id = 0;
-    for (auto core: all_cores) {
+    uint32_t num_datum_per_block = block_h * block_w * num_cores * datums_per_tile;
+    uint32_t last_block_offset = (num_blocks - 1) * num_datum_per_block;
+    uint32_t tiles_per_core = block_h * block_w_per_receiver; // Num slices = tiles per core to verify
+    for (auto core: all_cores | std::views::take(num_cores*2)) {
+
+        uint32_t dram_bank_id = core_id / 2; // A pair of two cores share a dram bank
+        uint32_t tile_stride_over_dram_banks = dram_bank_id * datums_per_tile;
+        uint32_t is_second_core = core_id % 2;
+        // Second core in a dram bank pair has an offset of half a block from that dram bank
+        uint32_t receiver_core_pair_offset = is_second_core * datums_per_tile * block_w_per_receiver * num_banks;
+        uint32_t input_start_index_for_core = last_block_offset + tile_stride_over_dram_banks + receiver_core_pair_offset;
+
         std::vector result_vec;
         tt_metal::detail::ReadFromDeviceL1(
-            device, core, cb_addr, num_tiles_cb * single_tile_size, result_vec);
-
-        uint32_t num_datum_per_block = block_h * block_w * num_datum_per_slice;
-        uint32_t tensor_slice_stride = core_id * num_datum_per_slice;
-        uint32_t last_block_offset = (num_blocks - 1) * num_datum_per_block * num_banks;
-        uint32_t start_index = tensor_slice_stride + last_block_offset;
-        uint32_t num_slices = block_h * block_w;
-
-        if (df == 0) {
+            device, core, cb_addr, num_tiles_cb / 2 * single_tile_size, result_vec);
+
+        if (df == 0) { // BFP4
+            auto result_bfp4 = unpack_bfp4_tiles_into_float_vec(result_vec, true, true);
+            auto input_bfp4 = unpack_bfp4_tiles_into_float_vec(input_vec, true, true);
+            if (!validate_data(
+                result_bfp4, input_bfp4, block_h, block_w_per_receiver, block_w,
+                datums_per_tile, num_banks, input_start_index_for_core)) {
+                return false;
+            }
+        } else if (df == 1) { // BFP8
             auto result_bfp8 = unpack_bfp8_tiles_into_float_vec(result_vec, true, true);
             auto input_bfp8 = unpack_bfp8_tiles_into_float_vec(input_vec, true, true);
-
-            for (uint32_t i=0; i < num_slices; ++i) {
-                uint32_t input_step = start_index + i * num_datum_per_slice * num_banks;
-                std::vector<float> input_slice(input_bfp8.begin() + input_step, input_bfp8.begin() + input_step + num_datum_per_slice);
-                uint32_t result_step = i * num_datum_per_slice;
-                std::vector<float> result_slice(result_bfp8.begin() + result_step, result_bfp8.begin() + result_step + num_datum_per_slice);
-
-                if (input_slice != result_slice) {
-                    return false;
-                }
+            if (!validate_data(
+                result_bfp8, input_bfp8, block_h, block_w_per_receiver, block_w,
+                datums_per_tile, num_banks, input_start_index_for_core)) {
+                return false;
             }
-
-        } else {
+        } else if (df == 2) { // BFLOAT16
             auto result_bf16 = unpack_uint32_vec_into_bfloat16_vec(result_vec);
             auto input_bf16 = unpack_uint32_vec_into_bfloat16_vec(input_vec);
-
-            for (uint32_t i=0; i < num_slices; ++i) {
-                uint32_t input_step = start_index + i * num_datum_per_slice * num_banks;
-                std::vector<bfloat16> input_slice(input_bf16.begin() + input_step, input_bf16.begin() + input_step + num_datum_per_slice);
-                uint32_t result_step = i * num_datum_per_slice;
-                std::vector<bfloat16> result_slice(result_bf16.begin() + result_step, result_bf16.begin() + result_step + num_datum_per_slice);
-
-                if (input_slice != result_slice) {
-                    return false;
-                }
+            if (!validate_data(
+                result_bf16, input_bf16, block_h, block_w_per_receiver, block_w,
+                datums_per_tile, num_banks, input_start_index_for_core)) {
+                return false;
             }
         }
         core_id ++;
     }
+    log_info("Validation passed.");
     return true;
 }
@@ -399,6 +449,9 @@ void get_l1_writer_core_coords_blackhole(
         uint32_t adj_core_x = dram_reader_core_phy.x + 1;
         uint32_t adj_core_y = dram_reader_core_phy.y;
         adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y));
+        uint32_t adj_core_x2 = dram_reader_core_phy.x + 2;
+        uint32_t adj_core_y2 = dram_reader_core_phy.y;
+        adj_core_physical.push_back(CoreCoord(adj_core_x2, adj_core_y2));
     }

     // move worker if they are in the harvested rows
@@ -564,6 +617,9 @@ void get_l1_writer_core_coords_grayskull(
         uint32_t adj_core_x = dram_reader_core_phy.x;
         uint32_t adj_core_y = dram_reader_core_phy.y + 1;
         adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y));
+        uint32_t adj_core_x2 = dram_reader_core_phy.x + 1;
+        uint32_t adj_core_y2 = dram_reader_core_phy.y + 1;
+        adj_core_physical.push_back(CoreCoord(adj_core_x2, adj_core_y2));
     }

     // move worker if they are in the harvested rows
@@ -673,9 +729,12 @@ void get_l1_writer_core_coords_wormhole_b0(
     for (int i = 0; i < all_dram_reader_cores.size(); ++i) {
         auto dram_reader_core = all_dram_reader_cores[i];
         auto dram_reader_core_phy = device->worker_core_from_logical_core(dram_reader_core);
-        uint32_t adj_core_x = dram_reader_core_phy.x + 1;
-        uint32_t adj_core_y = dram_reader_core_phy.y;
-        adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y));
+        uint32_t adj_core_x1 = dram_reader_core_phy.x + 1;
+        uint32_t adj_core_y1 = dram_reader_core_phy.y;
+        adj_core_physical.push_back(CoreCoord(adj_core_x1, adj_core_y1));
+        uint32_t adj_core_x2 = dram_reader_core_phy.x + 2;
+        uint32_t adj_core_y2 = dram_reader_core_phy.y;
+        adj_core_physical.push_back(CoreCoord(adj_core_x2, adj_core_y2));
     }

     // find the logical coord from physical coord
@@ -706,7 +765,7 @@ int main(int argc, char **argv) {
     bool pass = true;
     bool use_device_profiler = false;
     bool bypass_check = false;
-    uint32_t df = 0;
+    uint32_t df = 2;
     std::vector dram_bandwidth;
     uint32_t num_tests = 1;
     uint32_t num_blocks = 8;
@@ -742,7 +801,7 @@ int main(int argc, char **argv) {
             test_args::has_command_option_and_remaining_args(input_args, "--bypass-check");

         std::tie(df, input_args) =
-            test_args::get_command_option_uint32_and_remaining_args(input_args, "--data-type", 0);
+            test_args::get_command_option_uint32_and_remaining_args(input_args, "--data-type", 2);

         std::tie(num_banks, input_args) =
             test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-banks", 12);
@@ -776,10 +835,13 @@ int main(int argc, char **argv) {
         ////////////////////////////////////////////////////////////////////////////
         uint32_t input_size = 0;
         tt::DataFormat tile_format = tt::DataFormat::Bfp8_b;
-        if (df == 0) {
+        if (df == 0) { // BFP4
+            input_size = k * n * (512+64) / 1024;
+            tile_format = tt::DataFormat::Bfp4_b;
+        } else if (df == 1) { // BFP8
             input_size = k * n * 1088 / 1024;
             tile_format = tt::DataFormat::Bfp8_b;
-        } else if (df == 1) {
+        } else if (df == 2) { // BFLOAT16
             input_size = k * n * 2;
             tile_format = tt::DataFormat::Float16_b;
         } else {
@@ -789,6 +851,7 @@ int main(int argc, char **argv) {
         uint32_t nt = n / 32;
         uint32_t block_h = kt / num_blocks;
         uint32_t block_w = nt / num_banks;
+        uint32_t block_w_per_receiver = block_w / 2;
         uint32_t num_datum_per_slice = 32 * 32;

         uint32_t single_tile_size = tt_metal::detail::TileSize(tile_format);
@@ -808,8 +871,6 @@ int main(int argc, char **argv) {
         tt_metal::Device *device = tt_metal::CreateDevice(device_id);
         dram_bandwidth_spec = get_dram_bandwidth(device->arch());

-        TT_ASSERT(device->arch() == ARCH::WORMHOLE_B0, "device must be wh_b0");
-
         int clock_freq_mhz = get_tt_npu_clock(device);

         uint32_t num_tiles = static_cast<uint32_t>((input_size + single_tile_size - 1) / single_tile_size);
@@ -847,7 +908,7 @@ int main(int argc, char **argv) {
         log_info(
             LogTest,
             "Measuring DRAM bandwidth for input_size = {} bytes ({:.3f} MB, "
-            "{} tiles), using {} cores",
+            "{} tiles), using {} DRAM reading cores",
             input_size,
             static_cast<double>(input_size) / 1024 / 1024,
             num_tiles,
         ////////////////////////////////////////////////////////////////////////////
         // Input Setup
         ////////////////////////////////////////////////////////////////////////////
+        // DEBUGGING: Create a vector of bfloat16s where each element contains the tile number
+
         std::vector input_vec;
-        if (tile_format == tt::DataFormat::Bfp8_b) {
-            // input_vec = create_constant_vector_of_bfp8(
-            //     input_size, 100, true);
+        if (tile_format == tt::DataFormat::Bfp4_b) {
+            input_vec = create_random_vector_of_bfp4(
+                input_size, false, 100, 1234);
+        } else if (tile_format == tt::DataFormat::Bfp8_b) {
             input_vec = create_random_vector_of_bfp8(
-                input_size, true, 100, 1234);
+                input_size, false, 100, 1234);
         } else {
-            // input_vec = create_constant_vector_of_bfloat16(
-            //     input_size * total_banks / num_banks, 100);
             input_vec = create_random_vector_of_bfloat16(
                 input_size, 100, 1234);
         }
@@ -922,8 +984,13 @@ int main(int argc, char **argv) {
             num_blocks,
             block_h,
             block_w,
+            block_w_per_receiver,
             num_datum_per_slice);

+        if (!pass) {
+            log_info(LogTest, "Validation failed");
+        }
+
         pass &= tt_metal::CloseDevice(device);
     } catch (const std::exception &e) {
         pass = false;
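
For reference, a minimal sketch of the bandwidth arithmetic the updated Python test performs; it is not part of the patch, the helper names are illustrative only, and the 1000 MHz reference clock is an assumption based on the "target is based on max device frequency" comment above.

    # Bytes read from DRAM for a k x n tensor, keyed by the test's data_format
    # codes (0 = BFP4, 1 = BFP8, 2 = BFLOAT16). Each 32x32 tile (1024 datums)
    # carries 512 + 64, 1024 + 64, or 2048 bytes respectively.
    def input_size_bytes(k, n, data_format):
        bytes_per_1024_datums = {0: 512 + 64, 1: 1088, 2: 2048}[data_format]
        return k * n * bytes_per_1024_datums // 1024

    # Measured throughput in GB/s: bytes per cycle times cycles per microsecond
    # (dev_freq in MHz) gives MB/s; dividing by 1000 converts to GB/s.
    def throughput_gb_per_s(input_size, cycles, dev_freq_mhz):
        return input_size / cycles * dev_freq_mhz / 1000.0

    # Pass/fail bound: the per-shape bw_target (or the per-arch default), quoted
    # at the assumed 1000 MHz reference clock, scaled to the measured clock.
    def bw_bound_gb_per_s(bw_target, dev_freq_mhz):
        return bw_target * dev_freq_mhz / 1000.0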