From 2804a357dbc76671407453ed496857348580db7a Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 24 Oct 2024 11:41:10 -0600
Subject: [PATCH 1/6] Add some stricted checks for sizes/strides, as used by
 npu_dma_memcpy_nd

---
 lib/Dialect/AIEX/IR/AIEXDialect.cpp | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
index 12b89aad32..11be552fd8 100644
--- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp
+++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -189,22 +189,32 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
     return success();
   }
 
-  for (int i = 0; i < 3; i++) {
+  // Don't check inputStrides[3] unless memtiles as it may be 1 to signify
+  // repeat count if shim tile or compute tile
+  int stridesToCheck = targetModel.isMemTile(tileCol, tileRow) ? 4 : 3;
+
+  for (int i = 0; i < stridesToCheck; i++) {
     if (inputSizes[i] > 1 && inputStrides[i] < 1) {
       // If inputSize[i] == 1, anything is allowable in the stride, since that
       // stride will never be applied. For any larger size, we must verify that
       // the stride is positive.
       return forOp->emitOpError("Stride ")
              << i << " must be a positive integer.";
+    } else if (inputSizes[i] == 1 && inputStrides[i] != 0) {
+      forOp->emitWarning("Non-zero stride specified for size of 1; "
+                         "this stride value will have no effect");
     }
   }
-  // A value of zero is allowable for the fourth-dimension stride, as such a
-  // "repeat" can be accomplished by setting size==1 and repeat_count=size.
-  if (inputSizes[3] > 1 && inputStrides[3] < 0) {
-    return forOp->emitOpError("Stride 3 must be a non-negative integer.");
+  // A value of zero is allowable for the last-dimension stride, as such a
+  // "repeat" can be accomplished by setting strides[n]=1 and
+  // size[n]=repeat_count.
+  if (!targetModel.isMemTile(tileCol, tileRow) &&
+      inputSizes[stridesToCheck] > 1 && inputStrides[stridesToCheck] != 1) {
+    return forOp->emitOpError(
+        "Highest stride must be 1 to specify a repeat count.");
   }
 
-  for (int i = 0; i < 4; i++) {
+  for (int i = 0; i < stridesToCheck; i++) {
     // strides[0] == 1 is ok iff the transfer size is a multiple of
     // addressGranularity, which is checked below
     if (i == 0 && inputStrides[i] == 1)

From 4ad022b4b1e570116916472cde022e00576d314c Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 24 Oct 2024 12:30:38 -0600
Subject: [PATCH 2/6] Restore checks to previous state

---
 lib/Dialect/AIEX/IR/AIEXDialect.cpp | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
index 11be552fd8..12b89aad32 100644
--- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp
+++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -189,32 +189,22 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
     return success();
   }
 
-  // Don't check inputStrides[3] unless memtiles as it may be 1 to signify
-  // repeat count if shim tile or compute tile
-  int stridesToCheck = targetModel.isMemTile(tileCol, tileRow) ? 4 : 3;
-
-  for (int i = 0; i < stridesToCheck; i++) {
+  for (int i = 0; i < 3; i++) {
     if (inputSizes[i] > 1 && inputStrides[i] < 1) {
       // If inputSize[i] == 1, anything is allowable in the stride, since that
       // stride will never be applied. For any larger size, we must verify that
       // the stride is positive.
       return forOp->emitOpError("Stride ")
              << i << " must be a positive integer.";
-    } else if (inputSizes[i] == 1 && inputStrides[i] != 0) {
-      forOp->emitWarning("Non-zero stride specified for size of 1; "
-                         "this stride value will have no effect");
     }
   }
-  // A value of zero is allowable for the last-dimension stride, as such a
-  // "repeat" can be accomplished by setting strides[n]=1 and
-  // size[n]=repeat_count.
-  if (!targetModel.isMemTile(tileCol, tileRow) &&
-      inputSizes[stridesToCheck] > 1 && inputStrides[stridesToCheck] != 1) {
-    return forOp->emitOpError(
-        "Highest stride must be 1 to specify a repeat count.");
+  // A value of zero is allowable for the fourth-dimension stride, as such a
+  // "repeat" can be accomplished by setting size==1 and repeat_count=size.
+  if (inputSizes[3] > 1 && inputStrides[3] < 0) {
+    return forOp->emitOpError("Stride 3 must be a non-negative integer.");
   }
 
-  for (int i = 0; i < stridesToCheck; i++) {
+  for (int i = 0; i < 4; i++) {
     // strides[0] == 1 is ok iff the transfer size is a multiple of
     // addressGranularity, which is checked below
     if (i == 0 && inputStrides[i] == 1)

From c9f586b350e66006334204592e7c05ff4caf382c Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 24 Oct 2024 12:44:48 -0600
Subject: [PATCH 3/6] Try to clarify documentation on repeat_count/iteration
 stride

---
 include/aie/Dialect/AIE/IR/AIEOps.td | 4 ++++
 lib/Dialect/AIEX/IR/AIEXDialect.cpp  | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td
index bcff1fd713..21670b6827 100644
--- a/include/aie/Dialect/AIE/IR/AIEOps.td
+++ b/include/aie/Dialect/AIE/IR/AIEOps.td
@@ -857,6 +857,10 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", []> {
           // access/store element at/to index (i * 16 /*stride_2*/ + j * 1 /*stride_1*/ + k * 2 /*stride_0*/)
     ```
 
+    Note that an additional dimension of sizes/strides is accepted (5th dimension for memtiles, 4th otherwise);
+    the additional size value is interpreted as a repeat count whereas the additional stride value is
+    interpreted as an iteration stride.
+
     #### Important gotcha regarding strides
 
     All strides are expressed in multiples of the element width (just like `len` and `offset`)
diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
index 12b89aad32..df1088e817 100644
--- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp
+++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -198,8 +198,8 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
              << i << " must be a positive integer.";
     }
   }
-  // A value of zero is allowable for the fourth-dimension stride, as such a
-  // "repeat" can be accomplished by setting size==1 and repeat_count=size.
+  // A value of zero is allowable for the fourth-dimension stride
+  // (this indicates an interation stride for the repeat of 0)
   if (inputSizes[3] > 1 && inputStrides[3] < 0) {
     return forOp->emitOpError("Stride 3 must be a non-negative integer.");
   }

From 833e19062355255252d388f446f04a184365ac84 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 24 Oct 2024 15:10:30 -0600
Subject: [PATCH 4/6] update documentation

---
 .../basic/matrix_multiplication/whole_array/README.md            | 1 -
 1 file changed, 1 deletion(-)

diff --git a/programming_examples/basic/matrix_multiplication/whole_array/README.md b/programming_examples/basic/matrix_multiplication/whole_array/README.md
index 486b008f43..2e57cb5a87 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/README.md
+++ b/programming_examples/basic/matrix_multiplication/whole_array/README.md
@@ -207,7 +207,6 @@ The signature of the `aie.runtime_sequence()` operation lists as its arguments a
     * For each `tile_row` in the current row block:
         * The DMA transfer function `npu_dma_memcpy_nd` loads a segment of matrix A and matrix B data (submatrix a, submatrix b) from the host into the corresponding `inA_fifos` for the respective column, maintaining the appropriate strides and offsets.
         * Analogously to the data layout transformations described [further above](#tiling-and-data-layout-transformations) to translate a `m`&times;`k` matrix into blocks of `r`&times;`s`-submatrices, this transfer translates the input `M`&times;`K` and `K`&times;`N` matrices into submatrices of size `m`&times;`k` and `k`&times;`n`.
-           > Note that data layout transformations in the `npu_dma_memcpy_nd` operation are expressed in units of 4 bytes. This is why you will see all strides and the lowest-dimension length multiplied by a factor of `word_size_in` or `word_size_out` (to get the size in bytes) and then divided by four (to get the size in units of 4 bytes). This discrepancy will be streamlined in future versions.
     * The DMA transfer function `npu_dma_memcpy_nd` sends a segment of matrix C data (submatrix c) from the corresponding `outC_fifos` for the respective column, back to the host while maintaining the appropriate strides and offsets.
     * After completing DMA transfers for each column, `dma_wait` is used to synchronize their completion.
 

From 6d065bb84680e22b190dce07548e35a68706c902 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 24 Oct 2024 15:40:04 -0600
Subject: [PATCH 5/6] Allow repeat count with linear transfer

---
 lib/Dialect/AIEX/IR/AIEXDialect.cpp           |  18 +--
 lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp   |  29 ++---
 test/npu-xrt/nd_memcpy_linear_repeat/aie2.py  |  77 +++++++++++
 test/npu-xrt/nd_memcpy_linear_repeat/test.cpp | 122 ++++++++++++++++++
 4 files changed, 222 insertions(+), 24 deletions(-)
 create mode 100644 test/npu-xrt/nd_memcpy_linear_repeat/aie2.py
 create mode 100644 test/npu-xrt/nd_memcpy_linear_repeat/test.cpp

diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
index df1088e817..9be46cadef 100644
--- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp
+++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -185,10 +185,6 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
     return forOp->emitOpError(msg.str());
   }
 
-  if (skipTransformationChecks) {
-    return success();
-  }
-
   for (int i = 0; i < 3; i++) {
     if (inputSizes[i] > 1 && inputStrides[i] < 1) {
       // If inputSize[i] == 1, anything is allowable in the stride, since that
@@ -204,6 +200,10 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
     return forOp->emitOpError("Stride 3 must be a non-negative integer.");
   }
 
+  if (skipTransformationChecks) {
+    return success();
+  }
+
   for (int i = 0; i < 4; i++) {
     // strides[0] == 1 is ok iff the transfer size is a multiple of
     // addressGranularity, which is checked below
@@ -322,9 +322,10 @@ int64_t AIEX::NpuDmaMemcpyNdOp::getOffsetInBytes() {
   return offset;
 }
 
-// dma_memcpy_nd transfers of the form [1, 1, 1, len][0, 0, 0, 1] do not
+// dma_memcpy_nd transfers of the form [*, 1, 1, len][*, 0, 0, 1] do not
 // specify any data layout transformation, but simply express a contiguous
-// transfer of `len`.
+// transfer of `len`. We exclude checks to 4th dimension, because repeat count
+// is still possible without a data layout transformation.
 bool AIEX::NpuDmaMemcpyNdOp::isLinearTransferWithoutTransformation() {
   llvm::SmallVector<int64_t, 4> inputSizes =
       llvm::map_to_vector(llvm::reverse(getMixedSizes()), [](OpFoldResult s) {
@@ -334,9 +335,8 @@ bool AIEX::NpuDmaMemcpyNdOp::isLinearTransferWithoutTransformation() {
       llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
         return getConstantIntValue(s).value();
       });
-  return (inputSizes[1] == 1 && inputSizes[2] == 1 && inputSizes[3] == 1 &&
-          inputStrides[0] == 1 && inputStrides[1] == 0 &&
-          inputStrides[2] == 0 && inputStrides[3] == 0);
+  return (inputSizes[1] == 1 && inputSizes[2] == 1 && inputStrides[0] == 1 &&
+          inputStrides[1] == 0 && inputStrides[2] == 0);
 }
 
 LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
index b9390d6e59..ca6f7f1a69 100644
--- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
+++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
@@ -413,23 +413,22 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
 
       // d2_stride
       d2_stride = IntegerAttr::get(i32ty, strides[2]);
-
-      // iteration_current, iteration_size, iteration_stride, repeat_count
-      if (inputSizes[3] > 1) {
-        if (inputStrides[3] > 0) {
-          iteration_size = IntegerAttr::get(i32ty, sizes[3]);
-          iteration_stride = IntegerAttr::get(i32ty, strides[3]);
-        } else {
-          // We allow users to encode the repeat_count as a dimension 3 stride
-          // of 0. This must lower to a iteration wrap of 0, so no stride is
-          // ever added. We then repeat the BD using the repeat_count in
-          // NpuPushQueueOp.
-          iteration_size = zero;
-          iteration_stride = zero;
-        }
+    }
+    // iteration_current, iteration_size, iteration_stride, repeat_count
+    if (inputSizes[3] > 1) {
+      if (inputStrides[3] > 0) {
+        iteration_size = IntegerAttr::get(i32ty, sizes[3]);
+        iteration_stride = IntegerAttr::get(i32ty, strides[3]);
+      } else {
+        // We allow users to encode the repeat_count as a dimension 3 stride
+        // of 0. This must lower to a iteration wrap of 0, so no stride is
+        // ever added. We then repeat the BD using the repeat_count in
+        // NpuPushQueueOp.
+        iteration_size = zero;
+        iteration_stride = zero;
       }
-      repeat_count = IntegerAttr::get(i32ty, sizes[3]);
     }
+    repeat_count = IntegerAttr::get(i32ty, sizes[3]);
 
     // next_bd
 
diff --git a/test/npu-xrt/nd_memcpy_linear_repeat/aie2.py b/test/npu-xrt/nd_memcpy_linear_repeat/aie2.py
new file mode 100644
index 0000000000..b72a3bffae
--- /dev/null
+++ b/test/npu-xrt/nd_memcpy_linear_repeat/aie2.py
@@ -0,0 +1,77 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+# REQUIRES: ryzen_ai, valid_xchess_license
+#
+# RUN: %python %S/aie2.py > ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: %run_on_npu ./test.exe | FileCheck %s
+# CHECK: PASS!
+
+import numpy as np
+from aie.extras.context import mlir_mod_ctx
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.helpers.dialects.ext.scf import _for as range_
+
+dtype = np.int16
+repeat_count = 3
+a_len = 2048
+c_len = a_len * repeat_count
+
+
+def design():
+
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.npu1_4col)
+        def device_body():
+            a_ty = np.ndarray[(a_len,), np.dtype[dtype]]
+            c_ty = np.ndarray[(c_len,), np.dtype[dtype]]
+
+            ShimTile = tile(0, 0)
+            ComputeTile = tile(0, 2)
+            fifo_a = object_fifo("fifo_a", ShimTile, ComputeTile, 2, a_ty)
+            fifo_c = object_fifo("fifo_c", ComputeTile, ShimTile, 2, a_ty)
+
+            # Core
+            @core(ComputeTile)
+            def core_body():
+                for _ in range_(0, 0xFFFFFFFF):
+                    for i in range_(repeat_count):
+                        elem_c = fifo_c.acquire(ObjectFifoPort.Produce, 1)
+                        elem_a = fifo_a.acquire(ObjectFifoPort.Consume, 1)
+                        for i in range_(a_len):
+                            elem_c[i] = elem_a[i]
+                        fifo_a.release(ObjectFifoPort.Consume, 1)
+                        fifo_c.release(ObjectFifoPort.Produce, 1)
+
+            # To/from AIE-array data movement
+            @runtime_sequence(a_ty, a_ty, c_ty)
+            def sequence(A, _B, C):
+                npu_dma_memcpy_nd(
+                    metadata=fifo_a,
+                    bd_id=1,
+                    mem=A,
+                    sizes=[repeat_count, 1, 1, a_len],
+                    strides=[0, 0, 0, 1],
+                )
+                npu_dma_memcpy_nd(
+                    metadata=fifo_c,
+                    bd_id=0,
+                    mem=C,
+                    sizes=[1, 1, 1, c_len],
+                    strides=[0, 0, 0, 1],
+                )
+                dma_wait(fifo_c)
+
+    print(ctx.module)
+
+
+design()
diff --git a/test/npu-xrt/nd_memcpy_linear_repeat/test.cpp b/test/npu-xrt/nd_memcpy_linear_repeat/test.cpp
new file mode 100644
index 0000000000..64a3be5b61
--- /dev/null
+++ b/test/npu-xrt/nd_memcpy_linear_repeat/test.cpp
@@ -0,0 +1,122 @@
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <cassert>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef XCLBIN
+#define XCLBIN "final.xclbin"
+#endif
+
+#ifndef INSTS_TXT
+#define INSTS_TXT "insts.txt"
+#endif
+
+#ifndef KERNEL_NAME
+#define KERNEL_NAME "MLIR_AIE"
+#endif
+
+#define DTYPE int16_t
+#define A_DATATYPE DTYPE
+#define C_DATATYPE DTYPE
+
+#define A_LEN 2048
+#define REPEAT_COUNT 3
+#define C_LEN (A_LEN * REPEAT_COUNT)
+
+#define A_SIZE (A_LEN * sizeof(A_DATATYPE)) // in bytes
+#define B_SIZE A_SIZE                       // in bytes
+#define C_SIZE (C_LEN * sizeof(C_DATATYPE)) // in bytes
+
+int main(int argc, const char *argv[]) {
+
+  std::vector<uint32_t> instr_v = test_utils::load_instr_sequence(INSTS_TXT);
+  assert(instr_v.size() > 0);
+
+  // Get a device handle
+  unsigned int device_index = 0;
+  xrt::device device = xrt::device(device_index);
+
+  // Load the xclbin
+  xrt::xclbin xclbin = xrt::xclbin(XCLBIN);
+
+  // Get the kernel from the xclbin
+  std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
+  xrt::xclbin::kernel xkernel = *std::find_if(
+      xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
+        return k.get_name().rfind(KERNEL_NAME, 0) == 0;
+      });
+  std::string kernel_name = xkernel.get_name();
+  assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  auto kernel = xrt::kernel(context, kernel_name);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_a =
+      xrt::bo(device, A_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_b =
+      xrt::bo(device, B_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_c =
+      xrt::bo(device, C_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  A_DATATYPE *buf_a = bo_a.map<A_DATATYPE *>();
+  for (int i = 0; i < A_SIZE / sizeof(buf_a[0]); i++) {
+    buf_a[i] = 2 * i; // even
+  }
+  C_DATATYPE *buf_c = bo_c.map<C_DATATYPE *>();
+  memset(buf_c, 0, C_SIZE);
+
+  // Instruction buffer for DMA configuration
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_a.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_b.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_c.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  int errors = 0;
+  for (int i = 0; i < C_SIZE / sizeof(buf_c[0]); i++) {
+    std::cout << std::setw(4) << (long)buf_c[i] << " ";
+    if (buf_c[i] != buf_a[i % A_LEN]) {
+      errors += 1;
+    }
+  }
+  std::cout << std::endl;
+
+  if (errors == 0) {
+    std::cout << "PASS!" << std::endl;
+  } else {
+    std::cout << "FAIL." << std::endl;
+  }
+
+  return 0;
+}

From 4c54bd3b8c2f78f617e6b9b51e8da34905ea03cc Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 24 Oct 2024 16:01:58 -0600
Subject: [PATCH 6/6] Do more checks even when skipTransformationChecks==true

---
 lib/Dialect/AIEX/IR/AIEXDialect.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
index 9be46cadef..5ed2a2cd63 100644
--- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp
+++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -200,10 +200,6 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
     return forOp->emitOpError("Stride 3 must be a non-negative integer.");
   }
 
-  if (skipTransformationChecks) {
-    return success();
-  }
-
   for (int i = 0; i < 4; i++) {
     // strides[0] == 1 is ok iff the transfer size is a multiple of
     // addressGranularity, which is checked below
@@ -219,7 +215,7 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
     }
   }
 
-  if (hardwareSizes[0] > (1 << wrap_bits) - 1)
+  if (!skipTransformationChecks && hardwareSizes[0] > (1 << wrap_bits) - 1)
     return forOp->emitOpError(
         "Size 0 exceeds the [0:" + std::to_string((1 << wrap_bits) - 1) +
         "] range.");