Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Using objectFifo link to access the shared memory between compute tiles #1814

Merged
merged 52 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
3684d34
Using shared memory when available with links
pvasireddy-amd Oct 1, 2024
d724fae
Clang format and check messages
pvasireddy-amd Oct 2, 2024
cfeeeaa
Merge branch 'main' of https://github.com/Xilinx/mlir-aie
pvasireddy-amd Oct 2, 2024
b5cc5c2
Working example and required changes
pvasireddy-amd Oct 17, 2024
1b96ed4
Merge branch 'main' into link_with_shared_mem
pvasireddy-amd Oct 17, 2024
3b1b02e
Example code
pvasireddy-amd Oct 17, 2024
dff1349
Merge branch 'link_with_shared_mem' of https://github.com/Xilinx/mlir…
pvasireddy-amd Oct 17, 2024
110910d
clang format
pvasireddy-amd Oct 17, 2024
cf5035b
Removing unnecessary
pvasireddy-amd Oct 17, 2024
e0ec2ca
Lit test change
pvasireddy-amd Oct 17, 2024
14659dd
Python format
pvasireddy-amd Oct 17, 2024
08bce2e
Python format
pvasireddy-amd Oct 17, 2024
55c7d77
Cleanup of if-else
pvasireddy-amd Oct 18, 2024
21cdf1e
Clang format
pvasireddy-amd Oct 18, 2024
0fe5c94
Merge branch 'main' of https://github.com/Xilinx/mlir-aie
pvasireddy-amd Oct 23, 2024
70c8b9f
Updated with main
pvasireddy-amd Oct 23, 2024
16990f3
Naming
pvasireddy-amd Oct 23, 2024
ef34f52
Merge branch 'main' into link_with_shared_mem
pvasireddy-amd Oct 28, 2024
146e0f5
Merge branch 'main' into link_with_shared_mem
pvasireddy-amd Nov 4, 2024
9a5f0cc
Extension for link with different memref types
pvasireddy-amd Nov 5, 2024
5c8f609
Name change
pvasireddy-amd Nov 5, 2024
0814a51
More test cases
pvasireddy-amd Nov 5, 2024
9d4410a
formatting
pvasireddy-amd Nov 5, 2024
07ac11c
CHECK
pvasireddy-amd Nov 5, 2024
f0a60ba
Adding flag to run command
pvasireddy-amd Nov 5, 2024
63e52c6
Merge branch 'main' into dyn_objFifo_fix
pvasireddy-amd Nov 5, 2024
21b5a0f
Correcting the CHECK message
pvasireddy-amd Nov 5, 2024
b41a324
Merge branch 'dyn_objFifo_fix' of https://github.com/Xilinx/mlir-aie …
pvasireddy-amd Nov 5, 2024
f5d41eb
Verifying as programming example
pvasireddy-amd Nov 5, 2024
77513c8
Checking the flags that caused the issue
pvasireddy-amd Nov 5, 2024
29cf679
Revert "Checking the flags that caused the issue"
pvasireddy-amd Nov 5, 2024
1db8fbf
Maybe clang version
pvasireddy-amd Nov 5, 2024
a3bd519
Revert "Maybe clang version"
pvasireddy-amd Nov 5, 2024
6a0aa83
Merge branch 'dyn_objFifo_fix' of https://github.com/Xilinx/mlir-aie …
pvasireddy-amd Nov 6, 2024
b083a6a
Merge branch 'main' into link_with_shared_mem
pvasireddy-amd Nov 6, 2024
5eb474a
Merge branch 'link_with_shared_mem' of https://github.com/Xilinx/mlir…
pvasireddy-amd Nov 6, 2024
2d3d06a
Moving to test folder
pvasireddy-amd Nov 6, 2024
e145256
Change in file name
pvasireddy-amd Nov 6, 2024
f740fc0
Removing example; now in test/ folder
pvasireddy-amd Nov 6, 2024
c67e9be
Merge branch 'link_with_shared_mem' of https://github.com/Xilinx/mlir…
pvasireddy-amd Nov 13, 2024
7773462
Copyright
pvasireddy-amd Nov 13, 2024
6e5c687
Some changes
pvasireddy-amd Nov 13, 2024
cf957b5
Merge branch 'main' of https://github.com/Xilinx/mlir-aie into link_w…
pvasireddy-amd Nov 13, 2024
77d5dcc
Changed when merging
pvasireddy-amd Nov 13, 2024
65679ff
Other changes
pvasireddy-amd Nov 13, 2024
997866c
More cleanup
pvasireddy-amd Nov 14, 2024
2692821
Merge branch 'main' of https://github.com/Xilinx/mlir-aie into link_w…
pvasireddy-amd Nov 14, 2024
8410d34
Bringing back lost information
pvasireddy-amd Nov 14, 2024
d8923af
Formatting
pvasireddy-amd Nov 14, 2024
1c70c22
Added core to the test
pvasireddy-amd Nov 15, 2024
d75c61e
Removed unnecessary external buffers in test cases
pvasireddy-amd Nov 15, 2024
5e195fe
Merge branch 'main' into link_with_shared_mem
pvasireddy-amd Nov 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 51 additions & 9 deletions lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,12 +220,40 @@ struct AIEObjectFifoStatefulTransformPass
}
}

// Only test for this objfifo belonging to a LinkOp if we are in the shared
// memory case; otherwise, we will return `true` in any case.
// Check if the objectfifo operation can use shared memory for linking. If
// the link operation is a distribute or a join operation, or if the link
// has different memref types, DMAs are required even if shared memory is
// available and the objectfifo should be split. Otherwise also check if the
// via_shared_memory attribute of the objectfifo operation is set and try to
// apply it.
if (hasSharedMemory) {
if (auto linkOp = getOptionalLinkOp(createOp)) {
splitBecauseLink.push_back(createOp);
isUsedInLinkOp = true;
int share_dir = 0;
if (!linkOp->isDistribute() && !linkOp->isJoin()) {
auto fifoInType = llvm::cast<AIEObjectFifoType>(
linkOp->getInputObjectFifos()[0].getElemType());
auto producerType =
llvm::cast<MemRefType>(fifoInType.getElementType());
auto fifoOutType = llvm::cast<AIEObjectFifoType>(
linkOp->getOutputObjectFifos()[0].getElemType());
auto consumerType =
llvm::cast<MemRefType>(fifoOutType.getElementType());
if (consumerType != producerType) {
// TODO: Support for different memref types through shared
// memory without DMAs
splitBecauseLink.push_back(createOp);
}
if (createOp.getViaSharedMem().has_value()) {
checkAndApplyViaSharedMemAttribute(createOp, share_dir);
if (share_direction == share_dir)
isUsedInLinkOp = false;
else
splitBecauseLink.push_back(createOp);
}
} else {
splitBecauseLink.push_back(createOp);
}
}
}

Expand Down Expand Up @@ -1734,17 +1762,31 @@ struct AIEObjectFifoStatefulTransformPass
//===----------------------------------------------------------------===//
coreOp.walk([&](ObjectFifoSubviewAccessOp accessOp) {
auto acqOp = accessOp.getSubview().getDefiningOp<ObjectFifoAcquireOp>();
if (ObjectFifoCreateOp op = acqOp.getObjectFifo();
getOptionalLinkOp(op)) {
accessOp->emitOpError("currently cannot access objectFifo used in "
"ObjectFifoLinkOp");
return;
if (ObjectFifoCreateOp op = acqOp.getObjectFifo()) {
if (auto linkOp = getOptionalLinkOp(op); linkOp.has_value()) {
if (!linkOp->isDistribute() && !linkOp->isJoin()) {
for (auto consumerTile : op.getConsumerTiles()) {
if (auto consumerTileOp =
dyn_cast<TileOp>(consumerTile.getDefiningOp())) {
int share_dir_value = 0;
bool sharing = isSharedMemory(
op.getProducerTileOp(), consumerTileOp, &share_dir_value);
if (!sharing)
accessOp->emitOpError(
"currently cannot access objectFifo used in "
"ObjectFifoLinkOp if the tiles don't share memory");
}
}
} else
accessOp->emitOpError(
"currently cannot access objectFifo used in "
"ObjectFifoLinkOp if it is a distribute or join link");
}
}
accessOp.getOutput().replaceAllUsesWith(
subviews[acqOp][accessOp.getIndex()]->getBuffer());
});
}

// make global symbols to replace the to be erased ObjectFifoCreateOps
for (auto createOp : device.getOps<ObjectFifoCreateOp>()) {
builder.setInsertionPointToStart(&device.getBodyRegion().front());
Expand Down
66 changes: 66 additions & 0 deletions programming_examples/dyn_objFifo/nested_loops/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# Copyright (C) 2024, Advanced Micro Devices, Inc.
#
##===----------------------------------------------------------------------===##

# ---

# The following environment variables point to the Xilinx runtime (XRT) and
# Vitis installs. They should already be set by an environment setup script;
# the values below are only fallbacks.
XILINX_XRT?=/opt/xilinx/xrt
XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)

# ---

# Directory containing this Makefile, so sources resolve regardless of the
# directory make is invoked from.
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
XILINX_XRT_LIB?=${XILINX_XRT}/lib

CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include
XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
XRT_LIBS=-lxrt_coreutil
CXX=g++-13 -ggdb

#mlir_target?=build/aie.mlir
xclbin_target?=build/final.xclbin
insts_target?=build/insts.txt
host_target?=build/test

.PHONY: all
all: ${xclbin_target} ${host_target}

# Generate the MLIR design by running the Python description.
build/aie.mlir: ${srcdir}/aie2.py
	mkdir -p ${@D}
	python3 $< > $@

# Compile the AIE core kernel.
build/kernel.o: ${srcdir}/kernel.cc
	mkdir -p ${@D}
	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}

# aiecc.py runs from inside build/, hence the ../ prefix on the MLIR input and
# the build/ prefix stripped from the instruction file name.
${xclbin_target}: build/aie.mlir build/kernel.o
	mkdir -p ${@D}
	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
		--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}

# Host executable. No -DM/-DN here: this Makefile never sets M or N and
# test.cpp does not read them, so they would only define empty macros named
# M and N that can collide with identifiers in included headers.
${host_target}: ${srcdir}/test.cpp ${xclbin_target}
	mkdir -p ${@D}
	${CXX} ${XRT_FLAGS} -o $@ $< ${XRT_LIBS}

.PHONY: run
run: ${host_target}
	./${host_target}

xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh
.PHONY: sign
sign: ${xclbin_target}
	${xclbin_sign} -dev Phoenix -xclbin $<

.PHONY: clean
clean:
	rm -rf build
73 changes: 73 additions & 0 deletions programming_examples/dyn_objFifo/nested_loops/aie2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 AMD Inc.

# REQUIRES: ryzen_ai, valid_xchess_license
#
# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
# RUN: %python %S/aie2.py > ./aie2.mlir
# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
# RUN: %run_on_npu ./test.exe | FileCheck %s
# CHECK: PASS!
import numpy as np

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.helpers.dialects.ext.scf import _for as range_
from aie.extras.context import mlir_mod_ctx

# Innermost `sizes` entry of the input npu_dma_memcpy_nd in the runtime sequence.
N = 50
# Innermost `sizes` entry of the output npu_dma_memcpy_nd in the runtime sequence.
O = 250
# Divides N to give the length of one object fifo element (N // n_rows).
n_rows = 5
# Device passed to the @device decorator.
dev = AIEDevice.npu1_1col
# Column index used for both tile declarations.
col = 0


def nested_loops():
    """Build the nested-loop object fifo design and print the resulting module.

    The core acquires one input element per outer iteration and, for each of
    them, produces five output elements through the external
    `passthrough_10_i32` kernel before releasing the input element.
    """
    with mlir_mod_ctx() as module_ctx:

        @device(dev)
        def device_body():
            # Type of a single object fifo element: N // n_rows int32 values.
            row_ty = np.ndarray[(N // n_rows,), np.dtype[np.int32]]

            # Tiles taking part in the design (same column, rows 0 and 2).
            shim = tile(col, 0)
            compute = tile(col, 2)

            # Object fifos moving data shim -> compute and compute -> shim,
            # each with depth 2.
            of_in = object_fifo("in", shim, compute, 2, row_ty)
            of_out = object_fifo("out", compute, shim, 2, row_ty)

            # Externally compiled kernel, linked from kernel.o below.
            passthrough_10_i32 = external_func(
                "passthrough_10_i32", inputs=[row_ty, row_ty]
            )

            # Program running on the compute tile.
            @core(compute, "kernel.o")
            def core_body():
                for _ in range_(5):
                    row_in = of_in.acquire(ObjectFifoPort.Consume, 1)
                    # The same input element feeds five output elements.
                    for _ in range_(5):
                        row_out = of_out.acquire(ObjectFifoPort.Produce, 1)
                        passthrough_10_i32(row_in, row_out)
                        of_out.release(ObjectFifoPort.Produce, 1)
                    of_in.release(ObjectFifoPort.Consume, 1)

            # Runtime data movement between the host buffers and the fifos.
            @runtime_sequence(row_ty, row_ty)
            def sequence(A, C):
                npu_dma_memcpy_nd(
                    metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True
                )
                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, O])
                dma_wait(of_in, of_out)

        print(module_ctx.module)


nested_loops()
22 changes: 22 additions & 0 deletions programming_examples/dyn_objFifo/nested_loops/kernel.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2024 AMD Inc.

#include <aie_api/aie.hpp>

/// Copies N elements from `in` to `out`, converting each element from T_in to
/// T_out by assignment.
///
/// @tparam T_in  element type of the source buffer
/// @tparam T_out element type of the destination buffer
/// @tparam N     number of elements to copy
/// @param in   source buffer; must hold at least N elements
/// @param out  destination buffer; must hold at least N elements and, because
///             both pointers are `__restrict`, must not overlap `in`
template <typename T_in, typename T_out, unsigned long N>
void passthrough(const T_in *__restrict in, T_out *__restrict out) {
  // The index has the same unsigned type as N, so the comparison does not mix
  // signedness and cannot overflow for large N.
  for (unsigned long i = 0; i < N; i++) {
    out[i] = in[i];
  }
}

extern "C" {

// C-linkage entry point: copies ten `int` values from `in` to `out` via the
// generic passthrough template.
void passthrough_10_i32(const int *__restrict in, int *__restrict out) {
  constexpr unsigned long kElemCount = 10;
  passthrough<int, int, kElemCount>(in, out);
}
} // extern "C"
139 changes: 139 additions & 0 deletions programming_examples/dyn_objFifo/nested_loops/test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2024 AMD Inc.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

#ifndef XCLBIN
#define XCLBIN "build/final.xclbin"
#endif

#ifndef INSTS_TXT
#define INSTS_TXT "build/insts.txt"
#endif

#ifndef KERNEL_NAME
#define KERNEL_NAME "MLIR_AIE"
#endif

// Buffer geometry used by the host program: sizes are in bytes, WIDTH is the
// number of ints per row.
#define INPUT_SIZE (50 * sizeof(int))   // in bytes
#define OUTPUT_SIZE (250 * sizeof(int)) // in bytes
#define WIDTH_SIZE (10 * sizeof(int))   // in bytes
#define WIDTH 10
// Row counts are parenthesized so the quotient stays a single operand when the
// macro is used inside a larger expression (e.g. `x / INPUT_ROWS`).
#define INPUT_ROWS (INPUT_SIZE / WIDTH_SIZE)
#define OUTPUT_ROWS (OUTPUT_SIZE / WIDTH_SIZE)

/// Reads an instruction file containing one hexadecimal 32-bit word per line.
///
/// @param instr_path path of the text file to read
/// @return the parsed words, in file order
/// @throws std::runtime_error if the file cannot be opened, or if any line
///         does not start with a hexadecimal number
std::vector<uint32_t> load_instr_sequence(const std::string &instr_path) {
  std::ifstream instr_file(instr_path);
  // Report an unreadable file explicitly instead of returning an empty
  // sequence that the caller would have to detect on its own.
  if (!instr_file) {
    throw std::runtime_error("Unable to open instruction file: " + instr_path +
                             "\n");
  }

  std::vector<uint32_t> instr_v;
  std::string line;
  while (std::getline(instr_file, line)) {
    std::istringstream iss(line);
    uint32_t a;
    if (!(iss >> std::hex >> a)) {
      throw std::runtime_error("Unable to parse instruction file\n");
    }
    instr_v.push_back(a);
  }
  return instr_v;
}

int main(int argc, const char *argv[]) {

std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
assert(instr_v.size() > 0);

// Get a device handle
unsigned int device_index = 0;
xrt::device device = xrt::device(device_index);

// Load the xclbin
xrt::xclbin xclbin = xrt::xclbin(XCLBIN);

// Get the kernel from the xclbin
std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
xrt::xclbin::kernel xkernel = *std::find_if(
xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
return k.get_name().rfind(KERNEL_NAME, 0) == 0;
});
std::string kernel_name = xkernel.get_name();
assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);

device.register_xclbin(xclbin);

// get a hardware context
xrt::hw_context context(device, xclbin.get_uuid());

// get a kernel handle
auto kernel = xrt::kernel(context, kernel_name);

auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
auto bo_input =
xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
auto bo_output =
xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));

int *buf_input = bo_input.map<int *>();
std::cout << std::endl << std::endl << "Input: " << std::endl;
for (int i = 0; i < INPUT_ROWS; i++) {
std::cout << "row " << i << " : ";
for (int j = 0; j < WIDTH; j++) {
buf_input[i * WIDTH + j] = i;
std::cout << buf_input[i * WIDTH + j] << " ";
}
std::cout << std::endl << std::endl;
}
int *buf_output = bo_output.map<int *>();
memset(buf_output, 0, OUTPUT_SIZE);

// Instruction buffer for DMA configuration
void *buf_instr = bo_instr.map<void *>();
memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int));

bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE);

unsigned int opcode = 3;
auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output);
ert_cmd_state r = run.wait();
if (r != ERT_CMD_STATE_COMPLETED) {
std::cout << "Kernel did not complete. Returned status: " << r << "\n";
return 1;
}

bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE);

bool pass = true;
std::cout << std::endl << "Output: " << std::endl;
int expected_output = 0;
int five_repetitions = 0;
for (int i = 0; i < OUTPUT_ROWS; i++) {
std::cout << "row " << i << std::endl;
if (five_repetitions == 5) {
expected_output++;
five_repetitions = 0;
}
for (int j = 0; j < WIDTH; j++) {
std::cout << "expected: " << expected_output << ", ";
std::cout << "got: " << buf_output[i * WIDTH + j] << std::endl;
pass &= buf_output[i * WIDTH + j] == expected_output;
}
std::cout << std::endl << std::endl;
five_repetitions++;
}
std::cout << std::endl << std::endl;
std::cout << (pass ? "PASS!" : "FAIL.") << std::endl;

return 0;
}
Loading
Loading