From 2804a357dbc76671407453ed496857348580db7a Mon Sep 17 00:00:00 2001 From: Erika Hunhoff Date: Thu, 24 Oct 2024 11:41:10 -0600 Subject: [PATCH 1/6] Add some stricted checks for sizes/strides, as used by npu_dma_memcpy_nd --- lib/Dialect/AIEX/IR/AIEXDialect.cpp | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 12b89aad32..11be552fd8 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -189,22 +189,32 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType, return success(); } - for (int i = 0; i < 3; i++) { + // Don't check inputStrides[3] unless memtiles as it may be 1 to signify + // repeat count if shim tile or compute tile + int stridesToCheck = targetModel.isMemTile(tileCol, tileRow) ? 4 : 3; + + for (int i = 0; i < stridesToCheck; i++) { if (inputSizes[i] > 1 && inputStrides[i] < 1) { // If inputSize[i] == 1, anything is allowable in the stride, since that // stride will never be applied. For any larger size, we must verify that // the stride is positive. return forOp->emitOpError("Stride ") << i << " must be a positive integer."; + } else if (inputSizes[i] == 1 && inputStrides[i] != 0) { + forOp->emitWarning("Non-zero stride specified for size of 1; " + "this stride value will have no effect"); } } - // A value of zero is allowable for the fourth-dimension stride, as such a - // "repeat" can be accomplished by setting size==1 and repeat_count=size. - if (inputSizes[3] > 1 && inputStrides[3] < 0) { - return forOp->emitOpError("Stride 3 must be a non-negative integer."); + // A value of zero is allowable for the last-dimension stride, as such a + // "repeat" can be accomplished by setting strides[n]=1 and + // size[n]=repeat_count. + if (!targetModel.isMemTile(tileCol, tileRow) && + inputSizes[stridesToCheck] > 1 && inputStrides[stridesToCheck] != 1) { + return forOp->emitOpError( + "Highest stride must be 1 to specify a repeat count."); } - for (int i = 0; i < 4; i++) { + for (int i = 0; i < stridesToCheck; i++) { // strides[0] == 1 is ok iff the transfer size is a multiple of // addressGranularity, which is checked below if (i == 0 && inputStrides[i] == 1) From 4ad022b4b1e570116916472cde022e00576d314c Mon Sep 17 00:00:00 2001 From: Erika Hunhoff Date: Thu, 24 Oct 2024 12:30:38 -0600 Subject: [PATCH 2/6] Restore checks to previous state --- lib/Dialect/AIEX/IR/AIEXDialect.cpp | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 11be552fd8..12b89aad32 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -189,32 +189,22 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType, return success(); } - // Don't check inputStrides[3] unless memtiles as it may be 1 to signify - // repeat count if shim tile or compute tile - int stridesToCheck = targetModel.isMemTile(tileCol, tileRow) ? 4 : 3; - - for (int i = 0; i < stridesToCheck; i++) { + for (int i = 0; i < 3; i++) { if (inputSizes[i] > 1 && inputStrides[i] < 1) { // If inputSize[i] == 1, anything is allowable in the stride, since that // stride will never be applied. For any larger size, we must verify that // the stride is positive. return forOp->emitOpError("Stride ") << i << " must be a positive integer."; - } else if (inputSizes[i] == 1 && inputStrides[i] != 0) { - forOp->emitWarning("Non-zero stride specified for size of 1; " - "this stride value will have no effect"); } } - // A value of zero is allowable for the last-dimension stride, as such a - // "repeat" can be accomplished by setting strides[n]=1 and - // size[n]=repeat_count. - if (!targetModel.isMemTile(tileCol, tileRow) && - inputSizes[stridesToCheck] > 1 && inputStrides[stridesToCheck] != 1) { - return forOp->emitOpError( - "Highest stride must be 1 to specify a repeat count."); + // A value of zero is allowable for the fourth-dimension stride, as such a + // "repeat" can be accomplished by setting size==1 and repeat_count=size. + if (inputSizes[3] > 1 && inputStrides[3] < 0) { + return forOp->emitOpError("Stride 3 must be a non-negative integer."); } - for (int i = 0; i < stridesToCheck; i++) { + for (int i = 0; i < 4; i++) { // strides[0] == 1 is ok iff the transfer size is a multiple of // addressGranularity, which is checked below if (i == 0 && inputStrides[i] == 1) From c9f586b350e66006334204592e7c05ff4caf382c Mon Sep 17 00:00:00 2001 From: Erika Hunhoff Date: Thu, 24 Oct 2024 12:44:48 -0600 Subject: [PATCH 3/6] Try to clarify documentation on repeat_count/iteration stride --- include/aie/Dialect/AIE/IR/AIEOps.td | 4 ++++ lib/Dialect/AIEX/IR/AIEXDialect.cpp | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td index bcff1fd713..21670b6827 100644 --- a/include/aie/Dialect/AIE/IR/AIEOps.td +++ b/include/aie/Dialect/AIE/IR/AIEOps.td @@ -857,6 +857,10 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", []> { // access/store element at/to index (i * 16 /*stride_2*/ + j * 1 /*stride_1*/ + k * 2 /*stride_0*/) ``` + Note that an additional dimension of sizes/strides is accepted (5th dimension for memtiles, 4th otherwise); + the additional size value is interpreted as a repeat count whereas the additional stride value is + interpreted as an iteration stride. + #### Important gotcha regarding strides All strides are expressed in multiples of the element width (just like `len` and `offset`) diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 12b89aad32..df1088e817 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -198,8 +198,8 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType, << i << " must be a positive integer."; } } - // A value of zero is allowable for the fourth-dimension stride, as such a - // "repeat" can be accomplished by setting size==1 and repeat_count=size. + // A value of zero is allowable for the fourth-dimension stride + // (this indicates an interation stride for the repeat of 0) if (inputSizes[3] > 1 && inputStrides[3] < 0) { return forOp->emitOpError("Stride 3 must be a non-negative integer."); } From 833e19062355255252d388f446f04a184365ac84 Mon Sep 17 00:00:00 2001 From: Erika Hunhoff Date: Thu, 24 Oct 2024 15:10:30 -0600 Subject: [PATCH 4/6] update documentation --- .../basic/matrix_multiplication/whole_array/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/programming_examples/basic/matrix_multiplication/whole_array/README.md b/programming_examples/basic/matrix_multiplication/whole_array/README.md index 486b008f43..2e57cb5a87 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/README.md +++ b/programming_examples/basic/matrix_multiplication/whole_array/README.md @@ -207,7 +207,6 @@ The signature of the `aie.runtime_sequence()` operation lists as its arguments a * For each `tile_row` in the current row block: * The DMA transfer function `npu_dma_memcpy_nd` loads a segment of matrix A and matrix B data (submatrix a, submatrix b) from the host into the corresponding `inA_fifos` for the respective column, maintaining the appropriate strides and offsets. * Analogously to the data layout transformations described [further above](#tiling-and-data-layout-transformations) to translate a `m`×`k` matrix into blocks of `r`×`s`-submatrices, this transfer translates the input `M`×`K` and `K`×`N` matrices into submatrices of size `m`×`k` and `k`×`n`. - > Note that data layout transformations in the `npu_dma_memcpy_nd` operation are expressed in units of 4 bytes. This is why you will see all strides and the lowest-dimension length multiplied by a factor of `word_size_in` or `word_size_out` (to get the size in bytes) and then divided by four (to get the size in units of 4 bytes). This discrepancy will be streamlined in future versions. * The DMA transfer function `npu_dma_memcpy_nd` sends a segment of matrix C data (submatrix c) from the corresponding `outC_fifos` for the respective column, back to the host while maintaining the appropriate strides and offsets. * After completing DMA transfers for each column, `dma_wait` is used to synchronize their completion. From 6d065bb84680e22b190dce07548e35a68706c902 Mon Sep 17 00:00:00 2001 From: Erika Hunhoff Date: Thu, 24 Oct 2024 15:40:04 -0600 Subject: [PATCH 5/6] Allow repeat count with linear transfer --- lib/Dialect/AIEX/IR/AIEXDialect.cpp | 18 +-- lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp | 29 ++--- test/npu-xrt/nd_memcpy_linear_repeat/aie2.py | 77 +++++++++++ test/npu-xrt/nd_memcpy_linear_repeat/test.cpp | 122 ++++++++++++++++++ 4 files changed, 222 insertions(+), 24 deletions(-) create mode 100644 test/npu-xrt/nd_memcpy_linear_repeat/aie2.py create mode 100644 test/npu-xrt/nd_memcpy_linear_repeat/test.cpp diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index df1088e817..9be46cadef 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -185,10 +185,6 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType, return forOp->emitOpError(msg.str()); } - if (skipTransformationChecks) { - return success(); - } - for (int i = 0; i < 3; i++) { if (inputSizes[i] > 1 && inputStrides[i] < 1) { // If inputSize[i] == 1, anything is allowable in the stride, since that @@ -204,6 +200,10 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType, return forOp->emitOpError("Stride 3 must be a non-negative integer."); } + if (skipTransformationChecks) { + return success(); + } + for (int i = 0; i < 4; i++) { // strides[0] == 1 is ok iff the transfer size is a multiple of // addressGranularity, which is checked below @@ -322,9 +322,10 @@ int64_t AIEX::NpuDmaMemcpyNdOp::getOffsetInBytes() { return offset; } -// dma_memcpy_nd transfers of the form [1, 1, 1, len][0, 0, 0, 1] do not +// dma_memcpy_nd transfers of the form [*, 1, 1, len][*, 0, 0, 1] do not // specify any data layout transformation, but simply express a contiguous -// transfer of `len`. +// transfer of `len`. We exclude checks to 4th dimension, because repeat count +// is still possible without a data layout transformation. bool AIEX::NpuDmaMemcpyNdOp::isLinearTransferWithoutTransformation() { llvm::SmallVector inputSizes = llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) { @@ -334,9 +335,8 @@ bool AIEX::NpuDmaMemcpyNdOp::isLinearTransferWithoutTransformation() { llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) { return getConstantIntValue(s).value(); }); - return (inputSizes[1] == 1 && inputSizes[2] == 1 && inputSizes[3] == 1 && - inputStrides[0] == 1 && inputStrides[1] == 0 && - inputStrides[2] == 0 && inputStrides[3] == 0); + return (inputSizes[1] == 1 && inputSizes[2] == 1 && inputStrides[0] == 1 && + inputStrides[1] == 0 && inputStrides[2] == 0); } LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() { diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index b9390d6e59..ca6f7f1a69 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -413,23 +413,22 @@ struct DmaToNpuPattern : OpConversionPattern { // d2_stride d2_stride = IntegerAttr::get(i32ty, strides[2]); - - // iteration_current, iteration_size, iteration_stride, repeat_count - if (inputSizes[3] > 1) { - if (inputStrides[3] > 0) { - iteration_size = IntegerAttr::get(i32ty, sizes[3]); - iteration_stride = IntegerAttr::get(i32ty, strides[3]); - } else { - // We allow users to encode the repeat_count as a dimension 3 stride - // of 0. This must lower to a iteration wrap of 0, so no stride is - // ever added. We then repeat the BD using the repeat_count in - // NpuPushQueueOp. - iteration_size = zero; - iteration_stride = zero; - } + } + // iteration_current, iteration_size, iteration_stride, repeat_count + if (inputSizes[3] > 1) { + if (inputStrides[3] > 0) { + iteration_size = IntegerAttr::get(i32ty, sizes[3]); + iteration_stride = IntegerAttr::get(i32ty, strides[3]); + } else { + // We allow users to encode the repeat_count as a dimension 3 stride + // of 0. This must lower to a iteration wrap of 0, so no stride is + // ever added. We then repeat the BD using the repeat_count in + // NpuPushQueueOp. + iteration_size = zero; + iteration_stride = zero; } - repeat_count = IntegerAttr::get(i32ty, sizes[3]); } + repeat_count = IntegerAttr::get(i32ty, sizes[3]); // next_bd diff --git a/test/npu-xrt/nd_memcpy_linear_repeat/aie2.py b/test/npu-xrt/nd_memcpy_linear_repeat/aie2.py new file mode 100644 index 0000000000..b72a3bffae --- /dev/null +++ b/test/npu-xrt/nd_memcpy_linear_repeat/aie2.py @@ -0,0 +1,77 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 AMD Inc. + +# REQUIRES: ryzen_ai, valid_xchess_license +# +# RUN: %python %S/aie2.py > ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags +# RUN: %run_on_npu ./test.exe | FileCheck %s +# CHECK: PASS! + +import numpy as np +from aie.extras.context import mlir_mod_ctx + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.helpers.dialects.ext.scf import _for as range_ + +dtype = np.int16 +repeat_count = 3 +a_len = 2048 +c_len = a_len * repeat_count + + +def design(): + + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.npu1_4col) + def device_body(): + a_ty = np.ndarray[(a_len,), np.dtype[dtype]] + c_ty = np.ndarray[(c_len,), np.dtype[dtype]] + + ShimTile = tile(0, 0) + ComputeTile = tile(0, 2) + fifo_a = object_fifo("fifo_a", ShimTile, ComputeTile, 2, a_ty) + fifo_c = object_fifo("fifo_c", ComputeTile, ShimTile, 2, a_ty) + + # Core + @core(ComputeTile) + def core_body(): + for _ in range_(0, 0xFFFFFFFF): + for i in range_(repeat_count): + elem_c = fifo_c.acquire(ObjectFifoPort.Produce, 1) + elem_a = fifo_a.acquire(ObjectFifoPort.Consume, 1) + for i in range_(a_len): + elem_c[i] = elem_a[i] + fifo_a.release(ObjectFifoPort.Consume, 1) + fifo_c.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + @runtime_sequence(a_ty, a_ty, c_ty) + def sequence(A, _B, C): + npu_dma_memcpy_nd( + metadata=fifo_a, + bd_id=1, + mem=A, + sizes=[repeat_count, 1, 1, a_len], + strides=[0, 0, 0, 1], + ) + npu_dma_memcpy_nd( + metadata=fifo_c, + bd_id=0, + mem=C, + sizes=[1, 1, 1, c_len], + strides=[0, 0, 0, 1], + ) + dma_wait(fifo_c) + + print(ctx.module) + + +design() diff --git a/test/npu-xrt/nd_memcpy_linear_repeat/test.cpp b/test/npu-xrt/nd_memcpy_linear_repeat/test.cpp new file mode 100644 index 0000000000..64a3be5b61 --- /dev/null +++ b/test/npu-xrt/nd_memcpy_linear_repeat/test.cpp @@ -0,0 +1,122 @@ +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. + +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +#include "test_utils.h" + +#ifndef XCLBIN +#define XCLBIN "final.xclbin" +#endif + +#ifndef INSTS_TXT +#define INSTS_TXT "insts.txt" +#endif + +#ifndef KERNEL_NAME +#define KERNEL_NAME "MLIR_AIE" +#endif + +#define DTYPE int16_t +#define A_DATATYPE DTYPE +#define C_DATATYPE DTYPE + +#define A_LEN 2048 +#define REPEAT_COUNT 3 +#define C_LEN (A_LEN * REPEAT_COUNT) + +#define A_SIZE (A_LEN * sizeof(A_DATATYPE)) // in bytes +#define B_SIZE A_SIZE // in bytes +#define C_SIZE (C_LEN * sizeof(C_DATATYPE)) // in bytes + +int main(int argc, const char *argv[]) { + + std::vector instr_v = test_utils::load_instr_sequence(INSTS_TXT); + assert(instr_v.size() > 0); + + // Get a device handle + unsigned int device_index = 0; + xrt::device device = xrt::device(device_index); + + // Load the xclbin + xrt::xclbin xclbin = xrt::xclbin(XCLBIN); + + // Get the kernel from the xclbin + std::vector xkernels = xclbin.get_kernels(); + xrt::xclbin::kernel xkernel = *std::find_if( + xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) { + return k.get_name().rfind(KERNEL_NAME, 0) == 0; + }); + std::string kernel_name = xkernel.get_name(); + assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, kernel_name); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_a = + xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_b = + xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_c = + xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + A_DATATYPE *buf_a = bo_a.map(); + for (int i = 0; i < A_SIZE / sizeof(buf_a[0]); i++) { + buf_a[i] = 2 * i; // even + } + C_DATATYPE *buf_c = bo_c.map(); + memset(buf_c, 0, C_SIZE); + + // Instruction buffer for DMA configuration + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + int errors = 0; + for (int i = 0; i < C_SIZE / sizeof(buf_c[0]); i++) { + std::cout << std::setw(4) << (long)buf_c[i] << " "; + if (buf_c[i] != buf_a[i % A_LEN]) { + errors += 1; + } + } + std::cout << std::endl; + + if (errors == 0) { + std::cout << "PASS!" << std::endl; + } else { + std::cout << "FAIL." << std::endl; + } + + return 0; +} From 4c54bd3b8c2f78f617e6b9b51e8da34905ea03cc Mon Sep 17 00:00:00 2001 From: Erika Hunhoff Date: Thu, 24 Oct 2024 16:01:58 -0600 Subject: [PATCH 6/6] Do more checks even when skipTransformationChecks==true --- lib/Dialect/AIEX/IR/AIEXDialect.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 9be46cadef..5ed2a2cd63 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -200,10 +200,6 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType, return forOp->emitOpError("Stride 3 must be a non-negative integer."); } - if (skipTransformationChecks) { - return success(); - } - for (int i = 0; i < 4; i++) { // strides[0] == 1 is ok iff the transfer size is a multiple of // addressGranularity, which is checked below @@ -219,7 +215,7 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType, } } - if (hardwareSizes[0] > (1 << wrap_bits) - 1) + if (!skipTransformationChecks && hardwareSizes[0] > (1 << wrap_bits) - 1) return forOp->emitOpError( "Size 0 exceeds the [0:" + std::to_string((1 << wrap_bits) - 1) + "] range.");