Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Linear transfer without transformation but with repeat #1882

Merged
merged 7 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions include/aie/Dialect/AIE/IR/AIEOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,10 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", []> {
// access/store element at/to index (i * 16 /*stride_2*/ + j * 1 /*stride_1*/ + k * 2 /*stride_0*/)
```

Note that an additional dimension of sizes/strides is accepted (5th dimension for memtiles, 4th otherwise);
the additional size value is interpreted as a repeat count whereas the additional stride value is
interpreted as an iteration stride.

#### Important gotcha regarding strides

All strides are expressed in multiples of the element width (just like `len` and `offset`)
Expand Down
20 changes: 8 additions & 12 deletions lib/Dialect/AIEX/IR/AIEXDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,6 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
return forOp->emitOpError(msg.str());
}

if (skipTransformationChecks) {
return success();
}

for (int i = 0; i < 3; i++) {
if (inputSizes[i] > 1 && inputStrides[i] < 1) {
// If inputSize[i] == 1, anything is allowable in the stride, since that
Expand All @@ -198,8 +194,8 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
<< i << " must be a positive integer.";
}
}
// A value of zero is allowable for the fourth-dimension stride, as such a
// "repeat" can be accomplished by setting size==1 and repeat_count=size.
// A value of zero is allowable for the fourth-dimension stride
// (this indicates an iteration stride of 0 for the repeat)
if (inputSizes[3] > 1 && inputStrides[3] < 0) {
return forOp->emitOpError("Stride 3 must be a non-negative integer.");
}
Expand All @@ -219,7 +215,7 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
}
}

if (hardwareSizes[0] > (1 << wrap_bits) - 1)
if (!skipTransformationChecks && hardwareSizes[0] > (1 << wrap_bits) - 1)
return forOp->emitOpError(
"Size 0 exceeds the [0:" + std::to_string((1 << wrap_bits) - 1) +
"] range.");
Expand Down Expand Up @@ -322,9 +318,10 @@ int64_t AIEX::NpuDmaMemcpyNdOp::getOffsetInBytes() {
return offset;
}

// dma_memcpy_nd transfers of the form [1, 1, 1, len][0, 0, 0, 1] do not
// dma_memcpy_nd transfers of the form [*, 1, 1, len][*, 0, 0, 1] do not
// specify any data layout transformation, but simply express a contiguous
// transfer of `len`.
// transfer of `len`. The 4th dimension is excluded from these checks, because
// a repeat count is still possible without a data layout transformation.
bool AIEX::NpuDmaMemcpyNdOp::isLinearTransferWithoutTransformation() {
llvm::SmallVector<int64_t, 4> inputSizes =
llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) {
Expand All @@ -334,9 +331,8 @@ bool AIEX::NpuDmaMemcpyNdOp::isLinearTransferWithoutTransformation() {
llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
return getConstantIntValue(s).value();
});
return (inputSizes[1] == 1 && inputSizes[2] == 1 && inputSizes[3] == 1 &&
inputStrides[0] == 1 && inputStrides[1] == 0 &&
inputStrides[2] == 0 && inputStrides[3] == 0);
return (inputSizes[1] == 1 && inputSizes[2] == 1 && inputStrides[0] == 1 &&
inputStrides[1] == 0 && inputStrides[2] == 0);
}

LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
Expand Down
29 changes: 14 additions & 15 deletions lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -413,23 +413,22 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {

// d2_stride
d2_stride = IntegerAttr::get(i32ty, strides[2]);

// iteration_current, iteration_size, iteration_stride, repeat_count
if (inputSizes[3] > 1) {
if (inputStrides[3] > 0) {
iteration_size = IntegerAttr::get(i32ty, sizes[3]);
iteration_stride = IntegerAttr::get(i32ty, strides[3]);
} else {
// We allow users to encode the repeat_count as a dimension 3 stride
// of 0. This must lower to an iteration wrap of 0, so no stride is
// ever added. We then repeat the BD using the repeat_count in
// NpuPushQueueOp.
iteration_size = zero;
iteration_stride = zero;
}
}
// iteration_current, iteration_size, iteration_stride, repeat_count
if (inputSizes[3] > 1) {
if (inputStrides[3] > 0) {
iteration_size = IntegerAttr::get(i32ty, sizes[3]);
iteration_stride = IntegerAttr::get(i32ty, strides[3]);
} else {
// We allow users to encode the repeat_count as a dimension 3 stride
// of 0. This must lower to an iteration wrap of 0, so no stride is
// ever added. We then repeat the BD using the repeat_count in
// NpuPushQueueOp.
iteration_size = zero;
iteration_stride = zero;
}
repeat_count = IntegerAttr::get(i32ty, sizes[3]);
}
repeat_count = IntegerAttr::get(i32ty, sizes[3]);

// next_bd

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,6 @@ The signature of the `aie.runtime_sequence()` operation lists as its arguments a
* For each `tile_row` in the current row block:
* The DMA transfer function `npu_dma_memcpy_nd` loads a segment of matrix A and matrix B data (submatrix a, submatrix b) from the host into the corresponding `inA_fifos` for the respective column, maintaining the appropriate strides and offsets.
* Analogously to the data layout transformations described [further above](#tiling-and-data-layout-transformations) to translate a `m`&times;`k` matrix into blocks of `r`&times;`s`-submatrices, this transfer translates the input `M`&times;`K` and `K`&times;`N` matrices into submatrices of size `m`&times;`k` and `k`&times;`n`.
> Note that data layout transformations in the `npu_dma_memcpy_nd` operation are expressed in units of 4 bytes. This is why you will see all strides and the lowest-dimension length multiplied by a factor of `word_size_in` or `word_size_out` (to get the size in bytes) and then divided by four (to get the size in units of 4 bytes). This discrepancy will be streamlined in future versions.
* The DMA transfer function `npu_dma_memcpy_nd` sends a segment of matrix C data (submatrix c) from the corresponding `outC_fifos` for the respective column, back to the host while maintaining the appropriate strides and offsets.
* After completing DMA transfers for each column, `dma_wait` is used to synchronize their completion.

Expand Down
77 changes: 77 additions & 0 deletions test/npu-xrt/nd_memcpy_linear_repeat/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 AMD Inc.

# REQUIRES: ryzen_ai, valid_xchess_license
#
# RUN: %python %S/aie2.py > ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
# RUN: %run_on_npu ./test.exe | FileCheck %s
# CHECK: PASS!

import numpy as np
from aie.extras.context import mlir_mod_ctx

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.helpers.dialects.ext.scf import _for as range_

# Element type used for the buffer types built from these constants.
dtype = np.int16
# Number of times the input transfer is repeated (outermost `sizes` entry).
repeat_count = 3
# Number of elements in the input buffer A.
a_len = 2048
# Number of elements in the output buffer C: one copy of A per repeat.
c_len = a_len * repeat_count


def design():
    """Build the linear-repeat memcpy design and print its MLIR module.

    The input buffer A is pushed to the compute tile `repeat_count` times
    through a single `npu_dma_memcpy_nd` whose outermost size is the repeat
    count. The core copies each received block to the output FIFO, and the
    result is drained into C as one contiguous transfer of `c_len` elements.
    """

    with mlir_mod_ctx() as ctx:

        @device(AIEDevice.npu1_4col)
        def device_body():
            a_ty = np.ndarray[(a_len,), np.dtype[dtype]]
            c_ty = np.ndarray[(c_len,), np.dtype[dtype]]

            shim_tile = tile(0, 0)
            compute_tile = tile(0, 2)
            # Both FIFOs carry blocks of a_len elements; C is assembled from
            # repeat_count such blocks.
            fifo_a = object_fifo("fifo_a", shim_tile, compute_tile, 2, a_ty)
            fifo_c = object_fifo("fifo_c", compute_tile, shim_tile, 2, a_ty)

            # Core: copy every incoming block straight to the output FIFO.
            @core(compute_tile)
            def core_body():
                for _ in range_(0, 0xFFFFFFFF):
                    for _rep in range_(repeat_count):
                        dst = fifo_c.acquire(ObjectFifoPort.Produce, 1)
                        src = fifo_a.acquire(ObjectFifoPort.Consume, 1)
                        for idx in range_(a_len):
                            dst[idx] = src[idx]
                        fifo_a.release(ObjectFifoPort.Consume, 1)
                        fifo_c.release(ObjectFifoPort.Produce, 1)

            # To/from AIE-array data movement
            @runtime_sequence(a_ty, a_ty, c_ty)
            def sequence(A, _B, C):
                # Linear transfer of A, repeated repeat_count times
                # (outermost size is the repeat count, its stride is 0).
                npu_dma_memcpy_nd(
                    metadata=fifo_a,
                    bd_id=1,
                    mem=A,
                    sizes=[repeat_count, 1, 1, a_len],
                    strides=[0, 0, 0, 1],
                )
                # Single linear transfer collecting all of C.
                npu_dma_memcpy_nd(
                    metadata=fifo_c,
                    bd_id=0,
                    mem=C,
                    sizes=[1, 1, 1, c_len],
                    strides=[0, 0, 0, 1],
                )
                dma_wait(fifo_c)

        print(ctx.module)


design()
122 changes: 122 additions & 0 deletions test/npu-xrt/nd_memcpy_linear_repeat/test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2024 AMD Inc.

#include <cassert>
#include <cstring>
#include <fstream>
#include <iomanip>

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

#include "test_utils.h"

// Path of the xclbin to load; may be overridden at compile time.
#ifndef XCLBIN
#define XCLBIN "final.xclbin"
#endif

// Path of the instruction sequence file; may be overridden at compile time.
#ifndef INSTS_TXT
#define INSTS_TXT "insts.txt"
#endif

// Name of the kernel to look up in the xclbin; may be overridden at compile
// time.
#ifndef KERNEL_NAME
#define KERNEL_NAME "MLIR_AIE"
#endif

// Element type shared by the input (A) and output (C) buffers.
#define DTYPE int16_t
#define A_DATATYPE DTYPE
#define C_DATATYPE DTYPE

// Element counts: C holds REPEAT_COUNT back-to-back copies of A.
#define A_LEN 2048
#define REPEAT_COUNT 3
#define C_LEN (A_LEN * REPEAT_COUNT)

// Buffer sizes; B is allocated with the same size as A.
#define A_SIZE (A_LEN * sizeof(A_DATATYPE)) // in bytes
#define B_SIZE A_SIZE // in bytes
#define C_SIZE (C_LEN * sizeof(C_DATATYPE)) // in bytes

int main(int argc, const char *argv[]) {

std::vector<uint32_t> instr_v = test_utils::load_instr_sequence(INSTS_TXT);
assert(instr_v.size() > 0);

// Get a device handle
unsigned int device_index = 0;
xrt::device device = xrt::device(device_index);

// Load the xclbin
xrt::xclbin xclbin = xrt::xclbin(XCLBIN);

// Get the kernel from the xclbin
std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
xrt::xclbin::kernel xkernel = *std::find_if(
xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
return k.get_name().rfind(KERNEL_NAME, 0) == 0;
});
std::string kernel_name = xkernel.get_name();
assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);

device.register_xclbin(xclbin);

// get a hardware context
xrt::hw_context context(device, xclbin.get_uuid());

// get a kernel handle
auto kernel = xrt::kernel(context, kernel_name);

auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
auto bo_a =
xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
auto bo_b =
xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
auto bo_c =
xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));

A_DATATYPE *buf_a = bo_a.map<A_DATATYPE *>();
for (int i = 0; i < A_SIZE / sizeof(buf_a[0]); i++) {
buf_a[i] = 2 * i; // even
}
C_DATATYPE *buf_c = bo_c.map<C_DATATYPE *>();
memset(buf_c, 0, C_SIZE);

// Instruction buffer for DMA configuration
void *bufInstr = bo_instr.map<void *>();
memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));

bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE);

unsigned int opcode = 3;
auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
ert_cmd_state r = run.wait();
if (r != ERT_CMD_STATE_COMPLETED) {
std::cout << "Kernel did not complete. Returned status: " << r << "\n";
return 1;
}

bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);

int errors = 0;
for (int i = 0; i < C_SIZE / sizeof(buf_c[0]); i++) {
std::cout << std::setw(4) << (long)buf_c[i] << " ";
if (buf_c[i] != buf_a[i % A_LEN]) {
errors += 1;
}
}
std::cout << std::endl;

if (errors == 0) {
std::cout << "PASS!" << std::endl;
} else {
std::cout << "FAIL." << std::endl;
}

return 0;
}
Loading