From a0589f11a5b6791589e059f3ca624b8c80aacd5b Mon Sep 17 00:00:00 2001 From: Arik Yaacob Date: Fri, 1 Nov 2024 08:13:53 -0500 Subject: [PATCH] #13637: Enable L-shape multicast API and add tests (#14309) * #13637: Adding L-shape multicast - low level API, test. Still WIP * #13637: Added L-shape multicast API and tests * #13637: update docs and not limit tests to slow dispatch * #13637: fix pre-commit failures * #13637: clean up debug code * #13637: fix noc 1 core calculations, wrap BH only code with #ifdef * #13637: internally handle noc1 exclusion direction, update API method name * #0: fix typo --- .../dram_to_l1_multicast_exclude_region.cpp | 46 ++++++++ .../dram/test_dram_to_l1_multicast.cpp | 98 +++++++++++++++-- .../hw/inc/blackhole/noc_nonblocking_api.h | 65 +++++++++++ tt_metal/hw/inc/dataflow_api.h | 101 +++++++++++++++++- 4 files changed, 296 insertions(+), 14 deletions(-) create mode 100644 tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp new file mode 100644 index 00000000000..6576fc1301e --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t src_buffer_size = get_arg_val(3); + + uint32_t local_addr = get_arg_val(4); + + uint32_t dst_addr = get_arg_val(5); + uint32_t dst_noc_x_start = get_arg_val(6); + uint32_t dst_noc_y_start = get_arg_val(7); + uint32_t dst_noc_x_end = get_arg_val(8); + uint32_t dst_noc_y_end = get_arg_val(9); + uint32_t num_dests = get_arg_val(10); + uint32_t exclude_start_x = get_arg_val(11); + uint32_t exclude_start_y = get_arg_val(12); + uint32_t exclude_dir_x = get_arg_val(13); + uint32_t exclude_dir_y = get_arg_val(14); + + + // Read src buffer into local L1 buffer + uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size); + noc_async_read_barrier(); + + // multicast local L1 buffer to all destination cores + uint64_t dst_noc_multicast_addr = get_noc_multicast_addr( + dst_noc_x_start, + dst_noc_y_start, + dst_noc_x_end, + dst_noc_y_end, + dst_addr); + uint32_t noc_exclude_region = get_noc_exclude_region( + exclude_start_x, + exclude_start_y, + exclude_dir_x, + exclude_dir_y); + noc_async_write_multicast_exclude_region(local_addr, dst_noc_multicast_addr, src_buffer_size, num_dests, noc_exclude_region); + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram_to_l1_multicast.cpp index 07cd34ff2ef..3a309ec8fba 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram_to_l1_multicast.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram_to_l1_multicast.cpp @@ -19,6 +19,8 @@ struct DRAMtoL1MulticastConfig{ std::uint32_t dest_buffer_addr; std::uint32_t target_grid_offset; std::string kernel_file; + CoreCoord exclude_start; + CoreCoord exclude_direction; }; bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, const DRAMtoL1MulticastConfig &cfg){ @@ -53,6 +55,14 @@ bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, cons CoreCoord core_end = {core_start.x + (grid_size.x - 1), core_start.y + (grid_size.y - 1)}; auto core_start_physical = device->worker_core_from_logical_core(core_start); auto core_end_physical = device->worker_core_from_logical_core(core_end); + auto core_exclude_physical = device->worker_core_from_logical_core(cfg.exclude_start); + auto num_dests = (grid_size.x * grid_size.y) - cfg.target_grid_offset; + // calculate number of destination cores, taking exluded ones into account + if (cfg.exclude_start.x != 0 || cfg.exclude_start.y != 0) { + auto num_x = cfg.exclude_direction.x == 1 ? grid_size.x - cfg.exclude_start.x : cfg.exclude_start.x + 1; + auto num_y = cfg.exclude_direction.y == 1 ? grid_size.y - cfg.exclude_start.y : cfg.exclude_start.y + 1; + num_dests = (grid_size.x * grid_size.y) - num_x * num_y - cfg.target_grid_offset; + } std::vector mcast_reader_args = { (std::uint32_t)dram_buffer_addr, (std::uint32_t)dram_noc_xy.x, @@ -64,10 +74,15 @@ bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, cons (std::uint32_t)core_end_physical.y, (std::uint32_t)core_start_physical.x, (std::uint32_t)core_start_physical.y, - (std::uint32_t)(grid_size.x * grid_size.y) - cfg.target_grid_offset}; // Note: exclude src from acks, since we are not setting NOC_CMD_BRCST_SRC_INCLUDE + (std::uint32_t)num_dests, + (std::uint32_t)core_exclude_physical.x, + (std::uint32_t)core_exclude_physical.y, + (std::uint32_t)cfg.exclude_direction.x, + (std::uint32_t)cfg.exclude_direction.y,}; // Note: exclude src from acks, since we are not setting NOC_CMD_BRCST_SRC_INCLUDE log_debug(LogTest, "Start = {}, {}", core_start_physical.x, core_start_physical.y); log_debug(LogTest, "End = {}, {}", core_end_physical.x, core_end_physical.y); + log_debug(LogTest, "Exclude = {}, {}", core_exclude_physical.x, core_exclude_physical.y); auto mcast_reader_kernel = tt_metal::CreateKernel( program, cfg.kernel_file, @@ -87,6 +102,12 @@ bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, cons for(int i = 0 ; i < grid_size.y; i++) { for(int j = 0 ; j < grid_size.x; j++) { + // don't compare on skipped cores + if ( ((cfg.exclude_direction.x == 0 && j <= cfg.exclude_start.x) || (cfg.exclude_direction.x == 1 && j >= cfg.exclude_start.x)) && + ((cfg.exclude_direction.y == 0 && i <= cfg.exclude_start.y) || (cfg.exclude_direction.y == 1 && i >= cfg.exclude_start.y))) { + tt::log_debug(tt::LogTest, "Skipping core {},{}", j, i); // debug print to verify we don't skip unnecessary cores + continue; + } CoreCoord dest_core = {(std::size_t) core_start.x + j, (std::size_t) core_start.y + i}; std::vector dest_core_data; tt_metal::detail::ReadFromDeviceL1(device, dest_core, dest_buffer_addr, dram_buffer_size, dest_core_data); @@ -103,10 +124,6 @@ bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, cons } TEST_F(CommonFixture, DRAMtoL1Multicast){ - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ - tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); - GTEST_SKIP(); - } unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, .target_grid_offset = 1, @@ -117,10 +134,6 @@ TEST_F(CommonFixture, DRAMtoL1Multicast){ } } TEST_F(CommonFixture, DRAMtoL1MulticastLoopbackSrc){ - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ - tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); - GTEST_SKIP(); - } unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 500 * 1024, .target_grid_offset = 0, @@ -130,3 +143,70 @@ TEST_F(CommonFixture, DRAMtoL1MulticastLoopbackSrc){ ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config)); } } +TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpLeft){ + unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { + .dest_buffer_addr = 200 * 1024, + .target_grid_offset = 0, //source core is in exclusion zone, don't count twice + .kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp", + .exclude_start = {10, 6}, + .exclude_direction = {0, 0} + }; + for (unsigned int id=0; id < devices_.size(); id++){ + if (!(this->devices_.at(id)->arch() == tt::ARCH::BLACKHOLE)){ + tt::log_info(tt::LogTest, "This test is only supported on Blackhole"); + GTEST_SKIP(); + } + ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config)); + } +} + +TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpRight){ + unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { + .dest_buffer_addr = 200 * 1024, + .target_grid_offset = 1, + .kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp", + .exclude_start = {10, 6}, + .exclude_direction = {1, 0} + }; + for (unsigned int id=0; id < devices_.size(); id++){ + if (!(this->devices_.at(id)->arch() == tt::ARCH::BLACKHOLE)){ + tt::log_info(tt::LogTest, "This test is only supported on Blackhole"); + GTEST_SKIP(); + } + ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config)); + } +} + +TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionDownLeft){ + unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { + .dest_buffer_addr = 200 * 1024, + .target_grid_offset = 1, + .kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp", + .exclude_start = {10, 6}, + .exclude_direction = {0, 1} + }; + for (unsigned int id=0; id < devices_.size(); id++){ + if (!(this->devices_.at(id)->arch() == tt::ARCH::BLACKHOLE)){ + tt::log_info(tt::LogTest, "This test is only supported on Blackhole"); + GTEST_SKIP(); + } + ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config)); + } +} + +TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionDownRight){ + unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { + .dest_buffer_addr = 200 * 1024, + .target_grid_offset = 1, + .kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp", + .exclude_start = {10, 6}, + .exclude_direction = {1, 1} + }; + for (unsigned int id=0; id < devices_.size(); id++){ + if (!(this->devices_.at(id)->arch() == tt::ARCH::BLACKHOLE)){ + tt::log_info(tt::LogTest, "This test is only supported on Blackhole"); + GTEST_SKIP(); + } + ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config)); + } +} diff --git a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h index 8ec6b3921bd..3501cea7279 100644 --- a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h @@ -115,6 +115,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write( NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, (uint32_t)dest_addr); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_MID, (uint32_t)(dest_addr >> 32) & 0x1000000F); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_COORDINATE, (uint32_t)(dest_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_BRCST_EXCLUDE, 0); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); if (posted) { @@ -146,12 +147,41 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_loopback_src( NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, (uint32_t)dest_addr); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_MID, (uint32_t)(dest_addr >> 32) & 0x1000000F); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_COORDINATE, (uint32_t)(dest_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_BRCST_EXCLUDE, 0); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += num_dests; } +inline __attribute__((always_inline)) void ncrisc_noc_fast_write_exclude_region( + uint32_t noc, + uint32_t cmd_buf, + uint32_t src_addr, + uint64_t dest_addr, + uint32_t len_bytes, + uint32_t vc, + bool mcast, + bool linked, + uint32_t num_dests, + bool multicast_path_reserve, + uint32_t exclude_region) { + uint32_t noc_cmd_field = + NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc) | (linked ? NOC_CMD_VC_LINKED : 0x0) | + (mcast ? ((multicast_path_reserve ? NOC_CMD_PATH_RESERVE : 0) | NOC_CMD_BRCST_PACKET) : 0x0) | NOC_CMD_RESP_MARKED; + + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CTRL, noc_cmd_field); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_LO, src_addr); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, (uint32_t)dest_addr); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_MID, (uint32_t)(dest_addr >> 32) & 0x1000000F); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_COORDINATE, (uint32_t)(dest_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_BRCST_EXCLUDE, exclude_region); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); + noc_nonposted_writes_num_issued[noc] += 1; + noc_nonposted_writes_acked[noc] += num_dests; + } + inline __attribute__((always_inline)) void ncrisc_noc_blitz_write_setup( uint32_t noc, uint32_t cmd_buf, uint64_t dest_addr, uint32_t len_bytes, uint32_t vc, uint32_t num_times_to_write) { uint32_t noc_cmd_field = NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc) | NOC_CMD_RESP_MARKED; @@ -343,6 +373,41 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_loopbac noc, cmd_buf, src_addr, dest_addr, len_bytes, vc, mcast, linked, num_dests, multicast_path_reserve); } +inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_exclude_region( + uint32_t noc, + uint32_t cmd_buf, + uint32_t src_addr, + uint64_t dest_addr, + uint32_t len_bytes, + uint32_t vc, + bool mcast, + bool linked, + uint32_t num_dests, + bool multicast_path_reserve, + uint32_t exclude_region = 0) { + while (len_bytes > NOC_MAX_BURST_SIZE) { + while (!noc_cmd_buf_ready(noc, cmd_buf)); + ncrisc_noc_fast_write_exclude_region( + noc, + cmd_buf, + src_addr, + dest_addr, + NOC_MAX_BURST_SIZE, + vc, + mcast, + linked, + num_dests, + multicast_path_reserve, + exclude_region); + src_addr += NOC_MAX_BURST_SIZE; + dest_addr += NOC_MAX_BURST_SIZE; + len_bytes -= NOC_MAX_BURST_SIZE; + } + while (!noc_cmd_buf_ready(noc, cmd_buf)); + ncrisc_noc_fast_write_exclude_region( + noc, cmd_buf, src_addr, dest_addr, len_bytes, vc, mcast, linked, num_dests, multicast_path_reserve, exclude_region); +} + inline __attribute__((always_inline)) void noc_fast_write_dw_inline( uint32_t noc, uint32_t cmd_buf, diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 1223a7799e5..899184d397a 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -75,6 +75,14 @@ extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS]; #define NOC_MULTICAST_WRITE_VC 4 #define NOC_DISPATCH_MULTICAST_WRITE_VC 5 // Only to be used by the dispatch cores +#define EXCLUDE_ENABLED 1 +#define EXCLUDE_ENABLED_OFFSET 22 +#define EXCLUDE_DIRECTION_Y_OFFSET 21 +#define EXCLUDE_DIRECTION_X_OFFSET 20 +#define EXCLUDE_START_Y_OFFSET 14 +#define EXCLUDE_START_X_OFFSET 8 +#define DYNAMIC_NOC_DIRECTION(noc, direction) (noc == 1 ? 1 - direction : direction) + FORCE_INLINE uint32_t align(uint32_t addr, uint32_t alignment) { return ((addr - 1) | (alignment - 1)) + 1; } @@ -619,6 +627,23 @@ std::uint64_t get_noc_addr_helper(std::uint32_t noc_xy, std::uint32_t addr) { return ((uint64_t)(noc_xy) << NOC_ADDR_COORD_SHIFT) | addr; } +FORCE_INLINE +std::uint32_t get_noc_exclude_region( + std::uint32_t exclude_start_x, + std::uint32_t exclude_start_y, + std::uint32_t exclude_dir_x, + std::uint32_t exclude_dir_y, + uint8_t noc = noc_index) { + /* + Get an encoding which contians the definition of the exclusion area + */ + return (EXCLUDE_ENABLED << EXCLUDE_ENABLED_OFFSET | + DYNAMIC_NOC_DIRECTION(noc, exclude_dir_y) << EXCLUDE_DIRECTION_Y_OFFSET | + DYNAMIC_NOC_DIRECTION(noc, exclude_dir_x) << EXCLUDE_DIRECTION_X_OFFSET | + DYNAMIC_NOC_Y(noc, exclude_start_y) << EXCLUDE_START_Y_OFFSET | + DYNAMIC_NOC_X(noc, exclude_start_x) << EXCLUDE_START_X_OFFSET); +} + uint64_t get_dram_noc_addr(const uint32_t id, const uint32_t page_size, const uint32_t bank_base_address, const uint32_t offset = 0, uint8_t noc = noc_index) { @@ -1335,7 +1360,7 @@ void noc_semaphore_set_remote(std::uint32_t src_local_l1_addr, std::uint64_t dst * there is no restriction on the number of destinations, i.e. the * multicast destinations can span the full chip. However, as mentioned * previously, the multicast source cannot be part of the destinations. So, the - * maximum number of destinations is 119. + * maximum number of destinations is number of cores - 1. * * Return value: None * @@ -1343,8 +1368,8 @@ void noc_semaphore_set_remote(std::uint32_t src_local_l1_addr, std::uint64_t dst * |------------------------|--------------------------------------------------------------------------|----------|---------------------------------------------------------------|----------| * | src_local_l1_addr | Source address in local L1 memory | uint32_t | 0..1MB | True | * | dst_noc_addr_multicast | Encoding of the destinations nodes (x_start,y_start,x_end,y_end)+address | uint64_t | DOX-TODO(insert a reference to what constitutes valid coords) | True | - * | size | Size of data transfer in bytes | uint32_t | 0..1MB | True | - * | num_dests | Number of destinations that the multicast source is targetting | uint32_t | 0..119 | True | + * | size | Size of data transfer in bytes | uint32_t | 0..1MB | True | + * | num_dests | Number of destinations that the multicast source is targetting | uint32_t | 0..(number of cores -1) | True | */ template inline @@ -1398,7 +1423,7 @@ void noc_async_write_multicast( * |------------------------|--------------------------------------------------------------------------|----------|-----------------------------------------------------------|----------| * | src_local_l1_addr | Source address in local L1 memory | uint32_t | 0..1MB | True | * | dst_noc_addr_multicast | Encoding of the destinations nodes (x_start,y_start,x_end,y_end)+address | uint64_t | DOX-TODO(insert a reference to what constitutes valid coords) | True | - * | num_dests | Number of destinations that the multicast source is targetting | uint32_t | 0..119 | True | + * | num_dests | Number of destinations that the multicast source is targetting | uint32_t | 0..(number of cores - 1) | True | */ inline void noc_semaphore_set_multicast( @@ -1440,7 +1465,7 @@ void noc_semaphore_set_multicast( * |------------------------|--------------------------------------------------------------------------|----------|-----------------------------------------------------------|----------| * | src_local_l1_addr | Source address in local L1 memory | uint32_t | 0..1MB | True | * | dst_noc_addr_multicast | Encoding of the destinations nodes (x_start,y_start,x_end,y_end)+address | uint64_t | DOX-TODO(insert a reference to what constitutes valid coords) | True | - * | num_dests | Number of destinations that the multicast source is targetting | uint32_t | 0..119 | True | + * | num_dests | Number of destinations that the multicast source is targetting | uint32_t | 0..(number of cores) | True | */ inline void noc_semaphore_set_multicast_loopback_src( @@ -1486,6 +1511,72 @@ void noc_async_write_multicast_loopback_src( WAYPOINT("NMLD"); } +/** + * Initiates an asynchronous write from a source address in L1 memory on the + * Tensix core executing this function call to an L-shaped destination which is defined by + * a grid and an exclusion zone. + * The destinations are specified using a uint64_t encoding referencing an + * on-chip grid of nodes located at NOC coordinate range + * (x_start,y_start,x_end,y_end) and a local address created using + * *get_noc_multicast_addr* function. Also, *see noc_async_write_barrier*. + * Similarly, the exclusion zone is specified using uint32_t encoding referencing + * an on-chip core and directions relative to it created using *get_noc_exclude_region* function. + * + * The destination nodes can only be a set of Tensix cores + L1 memory address. + * The destination nodes must form an L-shaped grid (where dst_noc_addr_multicast defines a grid + * and exclude_region define a subgrid to exclude, the inner part of the L). The destination L1 + * memory address must be the same on all destination nodes. + * + * With this API, the multicast sender cannot be part of the multicast + * destinations. + * + * Note: The number of destinations needs to be non-zero. Besides that, + * there is no restriction on the number of destinations, i.e. the + * multicast destinations can span the full chip. However, as mentioned + * previously, the multicast source cannot be part of the destinations. So, the + * maximum number of destinations is number of cores - 1. + * + * Return value: None + * + * NOTE: only supported on Blackhole + * + * | Argument | Description | Type | Valid Range | Required | + * |------------------------|--------------------------------------------------------------------------|----------|---------------------------------------------------------------|----------| + * | src_local_l1_addr | Source address in local L1 memory | uint32_t | 0..1MB | True | + * | dst_noc_addr_multicast | Encoding of the destinations nodes (x_start,y_start,x_end,y_end)+address | uint64_t | DOX-TODO(insert a reference to what constitutes valid coords) | True | + * | size | Size of data transfer in bytes | uint32_t | 0..1MB | True | + * | num_dests | Number of destinations that the multicast source is targetting | uint32_t | 0..(number of cores - 1) | True | + * | exclude_region | Encoding of the excluded regin (x_start,y_start,x_direction,y_direction) | uint32_t | DOX-TODO(insert a reference to what constitutes valid coords) | True | + */ +#ifdef ARCH_BLACKHOLE +inline +void noc_async_write_multicast_exclude_region( + std::uint32_t src_local_l1_addr, + std::uint64_t dst_noc_addr_multicast, + std::uint32_t size, + std::uint32_t num_dests, + std::uint32_t exclude_region, + bool linked = false, + bool multicast_path_reserve = true, + uint8_t noc = noc_index) { + WAYPOINT("NMEW"); + DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(noc, dst_noc_addr_multicast, src_local_l1_addr, size); + ncrisc_noc_fast_write_any_len_exclude_region( + noc, + write_cmd_buf, + src_local_l1_addr, + dst_noc_addr_multicast, + size, + NOC_MULTICAST_WRITE_VC, + true, + linked, + num_dests, + multicast_path_reserve, + exclude_region); + WAYPOINT("NMED"); +} +#endif + /** * This blocking call waits for all the outstanding enqueued *noc_async_read* * calls issued on the current Tensix core to complete. After returning from