Xilinx · hunhoffe · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024
@@ -140,7 +140,7 @@ def BDDimLayoutAttr : AttrDef<AIE_Dialect, "BDDimLayout", []> {
   }];
 
   let parameters = (ins
-    "uint16_t" : $size,
+    "uint32_t" : $size,
     "uint32_t" : $stride
   );
 

@@ -443,9 +443,11 @@ LogicalResult AIEX::NpuPushQueueOp::verify() {
 LogicalResult AIEX::NpuWriteBdOp::verify() {
   const auto &targetModel = AIE::getTargetModel(*this);
   auto numBds = targetModel.getNumBDs(getColumn(), getRow());
+  bool isLinearTransfer =
+      (getD0Size() >= 1) && (getD1Size() == 1) && (getIterationSize() == 0);
   if (getBdId() > numBds)
     return emitOpError("BD ID exceeds the maximum ID.");
-  if (getD0Size() > 0x3FF)
+  if (!isLinearTransfer && getD0Size() > 0x3FF)
     return emitOpError("D0 Size exceeds the [0:1023] range.");
   if (getD0Stride() > 0xFFFFF)
     return emitOpError("D0 Stride exceeds the [0:1M-1] range.");

@@ -252,6 +252,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
         return bd_op->emitOpError("At most four data layout transformation "
                                   "dimensions may be provided.");
       }
+      bool isLinearTransfer = (input_sizes[0] >= 1) && (input_sizes[1] == 1) &&
+                              (input_sizes[2] == 1) && (input_sizes[3] == 1);
       for (size_t i = 0; i < dims->size(); i++) {
         // Pass down dimensions in reverse order; in the MLIR, this allows
         // us to specify step sizes/wraps in the same order as we would
@@ -264,7 +266,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
                               input_strides, sizes, strides);
       if (failed(verifyStridesWraps(bd_op, buffer_type, tile.getCol(),
                                     tile.getRow(), input_sizes, input_strides,
-                                    sizes, strides))) {
+                                    sizes, strides, isLinearTransfer))) {
         return failure();
       }
       // Ensure the total transfer length and the length expressed in the lowest

@@ -20,7 +20,14 @@ targetname = dmaTranspose
 M ?= 64
 K ?= 32
 
-build/aie.mlir: ${srcdir}/aie2.py
+aie_py_src=aie2.py
+use_alt?=0
+
+ifeq (${use_alt}, 1)
+aie_py_src=aie2_alt.py
+endif
+
+build/aie.mlir: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
 	python3 $< ${M} ${K} > $@
 

@@ -0,0 +1,77 @@
+# dma_transpose/aie2_alt.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+import numpy as np
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.extras.context import mlir_mod_ctx
+from aie.helpers.dialects.ext.scf import _for as range_
+
+N = 4096
+M = 64
+K = 64
+
+if len(sys.argv) == 3:
+    M = int(sys.argv[1])
+    K = int(sys.argv[2])
+    N = M * K
+
+tensor_ty = np.ndarray[(M, K), np.dtype[np.int32]]
+
+
+def my_passthrough():
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.npu1_1col)
+        def device_body():
+            # Tile declarations
+            ShimTile = tile(0, 0)
+            ComputeTile2 = tile(0, 2)
+
+            # AIE-array data movement with object fifos
+            of_in = object_fifo("in", ShimTile, ComputeTile2, 2, tensor_ty)
+            of_out = object_fifo("out", ComputeTile2, ShimTile, 2, tensor_ty)
+            object_fifo_link(of_in, of_out)
+
+            # Set up compute tiles
+
+            # Compute tile 2
+            @core(ComputeTile2)
+            def core_body():
+                for _ in range_(sys.maxsize):
+                    pass
+
+            # To/from AIE-array data movement
+            @runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
+            def sequence(A, B, C):
+                # The strides below are configured to read across all rows in the same column:
+                # the stride of K in dim/wrap 2 skips an entire row, so a full column is read.
+                in_task = dma_configure_task_for(of_in, issue_token=True)
+                with bds(in_task) as bd:
+                    with bd[0]:
+                        shim_dma_bd(
+                            A,
+                            sizes=[1, 1, K, M],
+                            strides=[1, 1, 1, K],
+                        )
+                        EndOp()
+
+                out_task = dma_configure_task_for(of_out, issue_token=True)
+                with bds(out_task) as bd:
+                    with bd[0]:
+                        shim_dma_bd(C, sizes=[1, 1, 1, N])
+                        EndOp()
+
+                dma_start_task(in_task, out_task)
+                dma_await_task(in_task, out_task)
+
+    print(ctx.module)
+
+
+my_passthrough()
@@ -0,0 +1,12 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, peano 
+//
+// RUN: mkdir -p test_alt
+// RUN: cd test_alt
+// RUN: make -f %S/Makefile clean
+// RUN: env use_alt=1 make -f %S/Makefile 
+// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
+// CHECK: PASS!
+
@@ -22,6 +22,11 @@ n_aie_cols?=4
 kernels=mm_${m}x${k}x${n}
 aieargs+=-m $m -k $k -n $n --n-aie-cols ${n_aie_cols}
 target_suffix=${M}x${K}x${N}_${m}x${k}x${n}_${n_aie_cols}c
+use_alt?=0
+
+ifeq (${use_alt}, 1)
+aie_py_src=aie2_alt.py
+endif
 
 include ${srcdir}/../makefile-common