Merge branch 'main' into port-examples-dma-task

Xilinx · Nov 15, 2024 · 22ec2f0 · 22ec2f0
2 parents 5931e38 + 3c91dcf
commit 22ec2f0
Show file tree

Hide file tree

Showing 12 changed files with 923 additions and 55 deletions.
diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
@@ -220,12 +220,40 @@ struct AIEObjectFifoStatefulTransformPass
         }
     }
 
-    // Only test for this objfifo belonging to a LinkOp if we are in the shared
-    // memory case; otherwise, we will return `true` in any case.
+    // Check if the objectfifo operation can use shared memory for linking. If
+    // the link operation is a distribute or a join operation, or if the link
+    // has different memref types, DMAs are required even if shared memory is
+    // available and the objectfifo should be split. Otherwise also check if the
+    // via_shared_memory attribute of the objectfifo operation is set and try to
+    // apply it.
     if (hasSharedMemory) {
       if (auto linkOp = getOptionalLinkOp(createOp)) {
-        splitBecauseLink.push_back(createOp);
         isUsedInLinkOp = true;
+        int share_dir = 0;
+        if (!linkOp->isDistribute() && !linkOp->isJoin()) {
+          auto fifoInType = llvm::cast<AIEObjectFifoType>(
+              linkOp->getInputObjectFifos()[0].getElemType());
+          auto producerType =
+              llvm::cast<MemRefType>(fifoInType.getElementType());
+          auto fifoOutType = llvm::cast<AIEObjectFifoType>(
+              linkOp->getOutputObjectFifos()[0].getElemType());
+          auto consumerType =
+              llvm::cast<MemRefType>(fifoOutType.getElementType());
+          if (consumerType != producerType) {
+            // TODO: Support for different memref types through shared
+            // memory without DMAs
+            splitBecauseLink.push_back(createOp);
+          }
+          if (createOp.getViaSharedMem().has_value()) {
+            checkAndApplyViaSharedMemAttribute(createOp, share_dir);
+            if (share_direction == share_dir)
+              isUsedInLinkOp = false;
+            else
+              splitBecauseLink.push_back(createOp);
+          }
+        } else {
+          splitBecauseLink.push_back(createOp);
+        }
       }
     }
 
@@ -1734,17 +1762,31 @@ struct AIEObjectFifoStatefulTransformPass
       //===----------------------------------------------------------------===//
       coreOp.walk([&](ObjectFifoSubviewAccessOp accessOp) {
         auto acqOp = accessOp.getSubview().getDefiningOp<ObjectFifoAcquireOp>();
-        if (ObjectFifoCreateOp op = acqOp.getObjectFifo();
-            getOptionalLinkOp(op)) {
-          accessOp->emitOpError("currently cannot access objectFifo used in "
-                                "ObjectFifoLinkOp");
-          return;
+        if (ObjectFifoCreateOp op = acqOp.getObjectFifo()) {
+          if (auto linkOp = getOptionalLinkOp(op); linkOp.has_value()) {
+            if (!linkOp->isDistribute() && !linkOp->isJoin()) {
+              for (auto consumerTile : op.getConsumerTiles()) {
+                if (auto consumerTileOp =
+                        dyn_cast<TileOp>(consumerTile.getDefiningOp())) {
+                  int share_dir_value = 0;
+                  bool sharing = isSharedMemory(
+                      op.getProducerTileOp(), consumerTileOp, &share_dir_value);
+                  if (!sharing)
+                    accessOp->emitOpError(
+                        "currently cannot access objectFifo used in "
+                        "ObjectFifoLinkOp if the tiles don't share memory");
+                }
+              }
+            } else
+              accessOp->emitOpError(
+                  "currently cannot access objectFifo used in "
+                  "ObjectFifoLinkOp if it is a distribute or join link");
+          }
         }
         accessOp.getOutput().replaceAllUsesWith(
             subviews[acqOp][accessOp.getIndex()]->getBuffer());
       });
     }
-
     // make global symbols to replace the to be erased ObjectFifoCreateOps
     for (auto createOp : device.getOps<ObjectFifoCreateOp>()) {
       builder.setInsertionPointToStart(&device.getBodyRegion().front());

diff --git a/programming_examples/dyn_objFifo/nested_loops/Makefile b/programming_examples/dyn_objFifo/nested_loops/Makefile
@@ -0,0 +1,66 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# 
+##===----------------------------------------------------------------------===##
+
+# ---
+
+# The following environment variables that point to the Xilinx runtime (XRT)
+# should be set up by an environment setup script already.
+XILINX_XRT?=/opt/xilinx/xrt
+XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)
+
+# ---
+
+# Directory containing this Makefile; sources are referenced relative to it.
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
+XILINX_XRT_LIB?=${XILINX_XRT}/lib
+
+# Flags for xchesscc_wrapper (kernel build) and for the host compiler/linker.
+CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include 
+XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
+XRT_LIBS=-lxrt_coreutil
+CXX=g++-13 -ggdb 
+
+# Output artifacts; each is assigned with ?= so an existing value wins.
+#mlir_target?=build/aie.mlir
+xclbin_target?=build/final.xclbin
+insts_target?=build/insts.txt
+host_target?=build/test
+
+# Default goal: build the xclbin and the host test executable.
+.PHONY: all
+all: ${xclbin_target} ${host_target}
+
+# Generate the MLIR design by running the Python script and capturing stdout.
+build/aie.mlir: ${srcdir}/aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+# Compile the core kernel with xchesscc_wrapper from inside the build directory.
+build/kernel.o: ${srcdir}/kernel.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
+
+# Run aiecc.py from inside the build directory to produce the xclbin; the same
+# invocation is also given the instruction file name via --npu-insts-name.
+${xclbin_target}: build/aie.mlir build/kernel.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}
+
+# Build the host test executable. No -DM/-DN are passed: M and N are never set
+# in this Makefile (they would expand to empty macro definitions) and test.cpp
+# takes its sizes from its own #defines.
+${host_target}: ${srcdir}/test.cpp ${xclbin_target}
+	mkdir -p ${@D}
+	${CXX} ${XRT_FLAGS} -o $@ $< ${XRT_LIBS}
+
+# Run the host test executable (building it first if needed).
+.PHONY: run
+run: ${host_target}
+	./${host_target}
+
+# Invoke the xclbin firmware setup script from the XRT install on the xclbin.
+xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh 
+.PHONY: sign
+sign: ${xclbin_target}
+	${xclbin_sign} -dev Phoenix -xclbin $<
+
+# Remove all build outputs; the leading '-' makes make ignore a failing rm
+# (e.g. when build/ does not exist).
+.PHONY: clean
+clean:
+	-rm -r build
diff --git a/programming_examples/dyn_objFifo/nested_loops/aie2.py b/programming_examples/dyn_objFifo/nested_loops/aie2.py
@@ -0,0 +1,73 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+# REQUIRES: ryzen_ai, valid_xchess_license
+#
+# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
+# RUN: %python %S/aie2.py > ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: %run_on_npu ./test.exe | FileCheck %s
+# CHECK: PASS!
+import numpy as np
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.helpers.dialects.ext.scf import _for as range_
+from aie.extras.context import mlir_mod_ctx
+
+# Element count used for the runtime-sequence transfer on buffer A.
+N = 50
+# Element count used for the runtime-sequence transfer on buffer C.
+O = 250
+# Each object-fifo element holds N // n_rows int32 values.
+n_rows = 5
+# Target device, and the column in which both tiles are declared.
+dev = AIEDevice.npu1_1col
+col = 0
+
+
+def nested_loops():
+    """Build the nested-loops object-fifo design and print it as MLIR.
+
+    The design is assembled inside an `mlir_mod_ctx` context and the
+    resulting module is written to stdout with `print`.
+    """
+    with mlir_mod_ctx() as ctx:
+
+        @device(dev)
+        def device_body():
+            # One object holds N // n_rows int32 elements.
+            tensor_ty = np.ndarray[(N // n_rows,), np.dtype[np.int32]]
+
+            # Tile declarations
+            ShimTile = tile(col, 0)
+            ComputeTile = tile(col, 2)
+
+            # AIE-array data movement with object fifos
+            # The core below consumes from "in" and produces into "out".
+            # NOTE(review): the literal 2 is presumably the fifo depth -- confirm.
+            of_in = object_fifo("in", ShimTile, ComputeTile, 2, tensor_ty)
+            of_out = object_fifo("out", ComputeTile, ShimTile, 2, tensor_ty)
+
+            # AIE Core Function declarations
+            passthrough_10_i32 = external_func(
+                "passthrough_10_i32", inputs=[tensor_ty, tensor_ty]
+            )
+
+            # Set up compute tiles
+            @core(ComputeTile, "kernel.o")
+            def core_body():
+                # Each of the 5 outer iterations acquires one input object and
+                # copies it into 5 separately acquired output objects before
+                # releasing the input.
+                for _ in range_(5):
+                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
+                    for _ in range_(5):
+                        elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
+                        passthrough_10_i32(elemIn, elemOut)
+                        of_out.release(ObjectFifoPort.Produce, 1)
+                    of_in.release(ObjectFifoPort.Consume, 1)
+
+            # To/from AIE-array data movement
+            # Describes an N-element transfer for A on "in" and an O-element
+            # transfer for C on "out", then waits on both fifos.
+            @runtime_sequence(tensor_ty, tensor_ty)
+            def sequence(A, C):
+                npu_dma_memcpy_nd(
+                    metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True
+                )
+                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, O])
+                dma_wait(of_in, of_out)
+
+    print(ctx.module)
+
+
+nested_loops()
diff --git a/programming_examples/dyn_objFifo/nested_loops/kernel.cc b/programming_examples/dyn_objFifo/nested_loops/kernel.cc
@@ -0,0 +1,22 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <aie_api/aie.hpp>
+
+/// Copies the first N elements of `in` to `out`, converting each element from
+/// T_in to T_out by assignment.
+///
+/// Both pointers are declared __restrict, so callers must pass non-overlapping
+/// buffers; the loop reads and writes exactly N elements of each.
+template <typename T_in, typename T_out, unsigned long N>
+void passthrough(const T_in *__restrict in, T_out *__restrict out) {
+  // Index with the same type as N so the loop bound comparison does not mix
+  // signed and unsigned operands.
+  for (unsigned long i = 0; i < N; i++) {
+    out[i] = in[i];
+  }
+}
+
+extern "C" {
+
+// Unmangled entry point for the core: copies 10 ints from `in` to `out`.
+void passthrough_10_i32(const int *__restrict in, int *__restrict out) {
+  constexpr unsigned long kNumElements = 10;
+  passthrough<int, int, kNumElements>(in, out);
+}
+
+} // extern "C"
diff --git a/programming_examples/dyn_objFifo/nested_loops/test.cpp b/programming_examples/dyn_objFifo/nested_loops/test.cpp
@@ -0,0 +1,139 @@
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+// Default artifact locations and kernel name; each can be overridden with -D
+// on the compiler command line.
+#ifndef XCLBIN
+#define XCLBIN "build/final.xclbin"
+#endif
+
+#ifndef INSTS_TXT
+#define INSTS_TXT "build/insts.txt"
+#endif
+
+#ifndef KERNEL_NAME
+#define KERNEL_NAME "MLIR_AIE"
+#endif
+
+// Buffer geometry: the buffers are treated as rows of WIDTH ints.
+#define INPUT_SIZE (50 * sizeof(int))   // in bytes
+#define OUTPUT_SIZE (250 * sizeof(int)) // in bytes
+#define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+#define WIDTH 10
+// Row counts are parenthesized so they expand safely inside larger
+// expressions (e.g. `x % INPUT_ROWS`).
+#define INPUT_ROWS (INPUT_SIZE / WIDTH_SIZE)
+#define OUTPUT_ROWS (OUTPUT_SIZE / WIDTH_SIZE)
+
+/// Reads an instruction sequence from a text file.
+///
+/// Each line of the file is parsed as one hexadecimal 32-bit word.
+///
+/// @param instr_path Path of the instruction file to read.
+/// @return The parsed words, in file order.
+/// @throws std::runtime_error if the file cannot be opened or a line does not
+///         start with a hexadecimal number.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  // Report a missing/unreadable file explicitly instead of silently returning
+  // an empty sequence.
+  if (!instr_file.is_open()) {
+    throw std::runtime_error("Unable to open instruction file\n");
+  }
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+/// Host-side test driver: loads the instruction sequence and the xclbin, runs
+/// the kernel once on XRT device 0, and checks the output buffer.
+///
+/// Input row i is filled with the value i. Each group of five consecutive
+/// output rows is expected to hold the same value, starting at 0 and
+/// increasing by one per group.
+///
+/// @return 0 if the kernel completed and every output element matched its
+///         expected value; 1 otherwise.
+int main(int argc, const char *argv[]) {
+
+  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
+  assert(instr_v.size() > 0);
+
+  // Get a device handle
+  unsigned int device_index = 0;
+  xrt::device device = xrt::device(device_index);
+
+  // Load the xclbin
+  xrt::xclbin xclbin = xrt::xclbin(XCLBIN);
+
+  // Get the kernel from the xclbin
+  std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
+  auto xkernel_it = std::find_if(
+      xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
+        return k.get_name().rfind(KERNEL_NAME, 0) == 0;
+      });
+  // Dereferencing end() is undefined behavior; fail cleanly when the xclbin
+  // contains no kernel whose name starts with KERNEL_NAME.
+  if (xkernel_it == xkernels.end()) {
+    std::cout << "Kernel " << KERNEL_NAME << " not found in " << XCLBIN
+              << "\n";
+    return 1;
+  }
+  xrt::xclbin::kernel xkernel = *xkernel_it;
+  std::string kernel_name = xkernel.get_name();
+  assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  auto kernel = xrt::kernel(context, kernel_name);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_input =
+      xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_output =
+      xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+  // Fill input row i with the value i, echoing it to stdout.
+  int *buf_input = bo_input.map<int *>();
+  std::cout << std::endl << std::endl << "Input: " << std::endl;
+  for (int i = 0; i < INPUT_ROWS; i++) {
+    std::cout << "row " << i << " : ";
+    for (int j = 0; j < WIDTH; j++) {
+      buf_input[i * WIDTH + j] = i;
+      std::cout << buf_input[i * WIDTH + j] << " ";
+    }
+    std::cout << std::endl << std::endl;
+  }
+  int *buf_output = bo_output.map<int *>();
+  memset(buf_output, 0, OUTPUT_SIZE);
+
+  // Instruction buffer for DMA configuration
+  void *buf_instr = bo_instr.map<void *>();
+  memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  bool pass = true;
+  std::cout << std::endl << "Output: " << std::endl;
+  // The expected value advances by one after every five output rows.
+  int expected_output = 0;
+  int five_repetitions = 0;
+  for (int i = 0; i < OUTPUT_ROWS; i++) {
+    std::cout << "row " << i << std::endl;
+    if (five_repetitions == 5) {
+      expected_output++;
+      five_repetitions = 0;
+    }
+    for (int j = 0; j < WIDTH; j++) {
+      std::cout << "expected: " << expected_output << ", ";
+      std::cout << "got: " << buf_output[i * WIDTH + j] << std::endl;
+      pass &= buf_output[i * WIDTH + j] == expected_output;
+    }
+    std::cout << std::endl << std::endl;
+    five_repetitions++;
+  }
+  std::cout << std::endl << std::endl;
+  std::cout << (pass ? "PASS!" : "FAIL.") << std::endl;
+
+  // Propagate the verification result so callers that check the exit status
+  // see a failure.
+  return pass ? 0 : 1;
+}