Skip to content

Commit

Permalink
#13637: Enable L-shape multicast API and add tests (#14309)
Browse files Browse the repository at this point in the history
* #13637: Adding L-shape multicast - low level API, test. Still WIP

* #13637: Added L-shape multicast API and tests

* #13637: update docs and not limit tests to slow dispatch

* #13637: fix pre-commit failures

* #13637: clean up debug code

* #13637: fix noc 1 core calculations, wrap BH only code with #ifdef

* #13637: internally handle noc1 exclusion direction, update API method name

* #0: fix typo
  • Loading branch information
arikTT authored Nov 1, 2024
1 parent 5de2817 commit a0589f1
Show file tree
Hide file tree
Showing 4 changed files with 296 additions and 14 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "dataflow_api.h"

void kernel_main() {
uint32_t src_addr = get_arg_val<uint32_t>(0);
uint32_t src_noc_x = get_arg_val<uint32_t>(1);
uint32_t src_noc_y = get_arg_val<uint32_t>(2);
uint32_t src_buffer_size = get_arg_val<uint32_t>(3);

uint32_t local_addr = get_arg_val<uint32_t>(4);

uint32_t dst_addr = get_arg_val<uint32_t>(5);
uint32_t dst_noc_x_start = get_arg_val<uint32_t>(6);
uint32_t dst_noc_y_start = get_arg_val<uint32_t>(7);
uint32_t dst_noc_x_end = get_arg_val<uint32_t>(8);
uint32_t dst_noc_y_end = get_arg_val<uint32_t>(9);
uint32_t num_dests = get_arg_val<uint32_t>(10);
uint32_t exclude_start_x = get_arg_val<uint32_t>(11);
uint32_t exclude_start_y = get_arg_val<uint32_t>(12);
uint32_t exclude_dir_x = get_arg_val<uint32_t>(13);
uint32_t exclude_dir_y = get_arg_val<uint32_t>(14);


// Read src buffer into local L1 buffer
uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr);
noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size);
noc_async_read_barrier();

// multicast local L1 buffer to all destination cores
uint64_t dst_noc_multicast_addr = get_noc_multicast_addr(
dst_noc_x_start,
dst_noc_y_start,
dst_noc_x_end,
dst_noc_y_end,
dst_addr);
uint32_t noc_exclude_region = get_noc_exclude_region(
exclude_start_x,
exclude_start_y,
exclude_dir_x,
exclude_dir_y);
noc_async_write_multicast_exclude_region(local_addr, dst_noc_multicast_addr, src_buffer_size, num_dests, noc_exclude_region);
noc_async_write_barrier();
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ struct DRAMtoL1MulticastConfig{
std::uint32_t dest_buffer_addr;
std::uint32_t target_grid_offset;
std::string kernel_file;
CoreCoord exclude_start;
CoreCoord exclude_direction;
};

bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, const DRAMtoL1MulticastConfig &cfg){
Expand Down Expand Up @@ -53,6 +55,14 @@ bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, cons
CoreCoord core_end = {core_start.x + (grid_size.x - 1), core_start.y + (grid_size.y - 1)};
auto core_start_physical = device->worker_core_from_logical_core(core_start);
auto core_end_physical = device->worker_core_from_logical_core(core_end);
auto core_exclude_physical = device->worker_core_from_logical_core(cfg.exclude_start);
auto num_dests = (grid_size.x * grid_size.y) - cfg.target_grid_offset;
// calculate number of destination cores, taking exluded ones into account
if (cfg.exclude_start.x != 0 || cfg.exclude_start.y != 0) {
auto num_x = cfg.exclude_direction.x == 1 ? grid_size.x - cfg.exclude_start.x : cfg.exclude_start.x + 1;
auto num_y = cfg.exclude_direction.y == 1 ? grid_size.y - cfg.exclude_start.y : cfg.exclude_start.y + 1;
num_dests = (grid_size.x * grid_size.y) - num_x * num_y - cfg.target_grid_offset;
}
std::vector<uint32_t> mcast_reader_args = {
(std::uint32_t)dram_buffer_addr,
(std::uint32_t)dram_noc_xy.x,
Expand All @@ -64,10 +74,15 @@ bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, cons
(std::uint32_t)core_end_physical.y,
(std::uint32_t)core_start_physical.x,
(std::uint32_t)core_start_physical.y,
(std::uint32_t)(grid_size.x * grid_size.y) - cfg.target_grid_offset}; // Note: exclude src from acks, since we are not setting NOC_CMD_BRCST_SRC_INCLUDE
(std::uint32_t)num_dests,
(std::uint32_t)core_exclude_physical.x,
(std::uint32_t)core_exclude_physical.y,
(std::uint32_t)cfg.exclude_direction.x,
(std::uint32_t)cfg.exclude_direction.y,}; // Note: exclude src from acks, since we are not setting NOC_CMD_BRCST_SRC_INCLUDE

log_debug(LogTest, "Start = {}, {}", core_start_physical.x, core_start_physical.y);
log_debug(LogTest, "End = {}, {}", core_end_physical.x, core_end_physical.y);
log_debug(LogTest, "Exclude = {}, {}", core_exclude_physical.x, core_exclude_physical.y);
auto mcast_reader_kernel = tt_metal::CreateKernel(
program,
cfg.kernel_file,
Expand All @@ -87,6 +102,12 @@ bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, cons

for(int i = 0 ; i < grid_size.y; i++) {
for(int j = 0 ; j < grid_size.x; j++) {
// don't compare on skipped cores
if ( ((cfg.exclude_direction.x == 0 && j <= cfg.exclude_start.x) || (cfg.exclude_direction.x == 1 && j >= cfg.exclude_start.x)) &&
((cfg.exclude_direction.y == 0 && i <= cfg.exclude_start.y) || (cfg.exclude_direction.y == 1 && i >= cfg.exclude_start.y))) {
tt::log_debug(tt::LogTest, "Skipping core {},{}", j, i); // debug print to verify we don't skip unnecessary cores
continue;
}
CoreCoord dest_core = {(std::size_t) core_start.x + j, (std::size_t) core_start.y + i};
std::vector<uint32_t> dest_core_data;
tt_metal::detail::ReadFromDeviceL1(device, dest_core, dest_buffer_addr, dram_buffer_size, dest_core_data);
Expand All @@ -103,10 +124,6 @@ bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, cons
}

TEST_F(CommonFixture, DRAMtoL1Multicast){
if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){
tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode");
GTEST_SKIP();
}
unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = {
.dest_buffer_addr = 200 * 1024,
.target_grid_offset = 1,
Expand All @@ -117,10 +134,6 @@ TEST_F(CommonFixture, DRAMtoL1Multicast){
}
}
TEST_F(CommonFixture, DRAMtoL1MulticastLoopbackSrc){
if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){
tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode");
GTEST_SKIP();
}
unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = {
.dest_buffer_addr = 500 * 1024,
.target_grid_offset = 0,
Expand All @@ -130,3 +143,70 @@ TEST_F(CommonFixture, DRAMtoL1MulticastLoopbackSrc){
ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config));
}
}
TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpLeft){
unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = {
.dest_buffer_addr = 200 * 1024,
.target_grid_offset = 0, //source core is in exclusion zone, don't count twice
.kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp",
.exclude_start = {10, 6},
.exclude_direction = {0, 0}
};
for (unsigned int id=0; id < devices_.size(); id++){
if (!(this->devices_.at(id)->arch() == tt::ARCH::BLACKHOLE)){
tt::log_info(tt::LogTest, "This test is only supported on Blackhole");
GTEST_SKIP();
}
ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config));
}
}

TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpRight){
unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = {
.dest_buffer_addr = 200 * 1024,
.target_grid_offset = 1,
.kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp",
.exclude_start = {10, 6},
.exclude_direction = {1, 0}
};
for (unsigned int id=0; id < devices_.size(); id++){
if (!(this->devices_.at(id)->arch() == tt::ARCH::BLACKHOLE)){
tt::log_info(tt::LogTest, "This test is only supported on Blackhole");
GTEST_SKIP();
}
ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config));
}
}

TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionDownLeft){
unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = {
.dest_buffer_addr = 200 * 1024,
.target_grid_offset = 1,
.kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp",
.exclude_start = {10, 6},
.exclude_direction = {0, 1}
};
for (unsigned int id=0; id < devices_.size(); id++){
if (!(this->devices_.at(id)->arch() == tt::ARCH::BLACKHOLE)){
tt::log_info(tt::LogTest, "This test is only supported on Blackhole");
GTEST_SKIP();
}
ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config));
}
}

TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionDownRight){
unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = {
.dest_buffer_addr = 200 * 1024,
.target_grid_offset = 1,
.kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp",
.exclude_start = {10, 6},
.exclude_direction = {1, 1}
};
for (unsigned int id=0; id < devices_.size(); id++){
if (!(this->devices_.at(id)->arch() == tt::ARCH::BLACKHOLE)){
tt::log_info(tt::LogTest, "This test is only supported on Blackhole");
GTEST_SKIP();
}
ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config));
}
}
65 changes: 65 additions & 0 deletions tt_metal/hw/inc/blackhole/noc_nonblocking_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write(
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, (uint32_t)dest_addr);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_MID, (uint32_t)(dest_addr >> 32) & 0x1000000F);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_COORDINATE, (uint32_t)(dest_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_BRCST_EXCLUDE, 0);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
if (posted) {
Expand Down Expand Up @@ -146,12 +147,41 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_loopback_src(
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, (uint32_t)dest_addr);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_MID, (uint32_t)(dest_addr >> 32) & 0x1000000F);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_COORDINATE, (uint32_t)(dest_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_BRCST_EXCLUDE, 0);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
noc_nonposted_writes_num_issued[noc] += 1;
noc_nonposted_writes_acked[noc] += num_dests;
}

inline __attribute__((always_inline)) void ncrisc_noc_fast_write_exclude_region(
uint32_t noc,
uint32_t cmd_buf,
uint32_t src_addr,
uint64_t dest_addr,
uint32_t len_bytes,
uint32_t vc,
bool mcast,
bool linked,
uint32_t num_dests,
bool multicast_path_reserve,
uint32_t exclude_region) {
uint32_t noc_cmd_field =
NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc) | (linked ? NOC_CMD_VC_LINKED : 0x0) |
(mcast ? ((multicast_path_reserve ? NOC_CMD_PATH_RESERVE : 0) | NOC_CMD_BRCST_PACKET) : 0x0) | NOC_CMD_RESP_MARKED;

NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CTRL, noc_cmd_field);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_LO, src_addr);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, (uint32_t)dest_addr);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_MID, (uint32_t)(dest_addr >> 32) & 0x1000000F);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_COORDINATE, (uint32_t)(dest_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_BRCST_EXCLUDE, exclude_region);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes);
NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
noc_nonposted_writes_num_issued[noc] += 1;
noc_nonposted_writes_acked[noc] += num_dests;
}

inline __attribute__((always_inline)) void ncrisc_noc_blitz_write_setup(
uint32_t noc, uint32_t cmd_buf, uint64_t dest_addr, uint32_t len_bytes, uint32_t vc, uint32_t num_times_to_write) {
uint32_t noc_cmd_field = NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc) | NOC_CMD_RESP_MARKED;
Expand Down Expand Up @@ -343,6 +373,41 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_loopbac
noc, cmd_buf, src_addr, dest_addr, len_bytes, vc, mcast, linked, num_dests, multicast_path_reserve);
}

inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_exclude_region(
uint32_t noc,
uint32_t cmd_buf,
uint32_t src_addr,
uint64_t dest_addr,
uint32_t len_bytes,
uint32_t vc,
bool mcast,
bool linked,
uint32_t num_dests,
bool multicast_path_reserve,
uint32_t exclude_region = 0) {
while (len_bytes > NOC_MAX_BURST_SIZE) {
while (!noc_cmd_buf_ready(noc, cmd_buf));
ncrisc_noc_fast_write_exclude_region(
noc,
cmd_buf,
src_addr,
dest_addr,
NOC_MAX_BURST_SIZE,
vc,
mcast,
linked,
num_dests,
multicast_path_reserve,
exclude_region);
src_addr += NOC_MAX_BURST_SIZE;
dest_addr += NOC_MAX_BURST_SIZE;
len_bytes -= NOC_MAX_BURST_SIZE;
}
while (!noc_cmd_buf_ready(noc, cmd_buf));
ncrisc_noc_fast_write_exclude_region(
noc, cmd_buf, src_addr, dest_addr, len_bytes, vc, mcast, linked, num_dests, multicast_path_reserve, exclude_region);
}

inline __attribute__((always_inline)) void noc_fast_write_dw_inline(
uint32_t noc,
uint32_t cmd_buf,
Expand Down
Loading

0 comments on commit a0589f1

Please sign in to comment.