Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dma_task in programming examples #1919

Draft
wants to merge 22 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
b6d7180
Start to port programming examples to use dma task
hunhoffe Nov 13, 2024
49fa9e1
remove unneeded field
hunhoffe Nov 13, 2024
b43a48d
Finish adding alternate (dma task) impls of programming_examples/vision
hunhoffe Nov 13, 2024
fb59e20
Add alt version for ml programming examples
hunhoffe Nov 13, 2024
bbc991f
Add convenience wrappers around dma_*_task functions
hunhoffe Nov 13, 2024
2bfb325
Start porting some of the basic examples to use the dma task structure
hunhoffe Nov 13, 2024
88a033f
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 13, 2024
774e0e6
Finish rewriting programming examples to use dma task
hunhoffe Nov 13, 2024
f95bf36
Fix for [1, 1, 1, N]
jgmelber Nov 14, 2024
174cdf0
Additional verification linear case patch
jgmelber Nov 14, 2024
bbcde4c
Default sizes to 1
jgmelber Nov 14, 2024
0d86176
Use uint32_t for sizes to match transfer length for dim 0
jgmelber Nov 14, 2024
c7f38ec
Apply suggestions from code review
jgmelber Nov 14, 2024
daa4598
Revert "Default sizes to 1"
jgmelber Nov 14, 2024
259b2a6
Init sizes for vec scalar mul
jgmelber Nov 14, 2024
10b386b
calculate transfer len with less lines of code
hunhoffe Nov 14, 2024
d3ee4e6
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 14, 2024
efbff0e
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 14, 2024
ee0fa1a
Remove lingering npu_dma_memcpy_nd from alt examples
hunhoffe Nov 14, 2024
5931e38
Attempt to use repeat count correctly in examples
hunhoffe Nov 15, 2024
22ec2f0
Merge branch 'main' into port-examples-dma-task
hunhoffe Nov 15, 2024
bdc73f0
Does not fix things, but update understanding of repeat count
hunhoffe Nov 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/aie/Dialect/AIE/IR/AIEAttrs.td
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def BDDimLayoutAttr : AttrDef<AIE_Dialect, "BDDimLayout", []> {
}];

let parameters = (ins
"uint16_t" : $size,
"uint32_t" : $size,
"uint32_t" : $stride
);

Expand Down
4 changes: 3 additions & 1 deletion lib/Dialect/AIEX/IR/AIEXDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,9 +443,11 @@ LogicalResult AIEX::NpuPushQueueOp::verify() {
LogicalResult AIEX::NpuWriteBdOp::verify() {
const auto &targetModel = AIE::getTargetModel(*this);
auto numBds = targetModel.getNumBDs(getColumn(), getRow());
bool isLinearTransfer =
(getD0Size() >= 1) && (getD1Size() == 1) && (getIterationSize() == 0);
if (getBdId() > numBds)
return emitOpError("BD ID exceeds the maximum ID.");
if (getD0Size() > 0x3FF)
if (!isLinearTransfer && getD0Size() > 0x3FF)
return emitOpError("D0 Size exceeds the [0:1023] range.");
if (getD0Stride() > 0xFFFFF)
return emitOpError("D0 Stride exceeds the [0:1M-1] range.");
Expand Down
4 changes: 3 additions & 1 deletion lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,8 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
return bd_op->emitOpError("At most four data layout transformation "
"dimensions may be provided.");
}
bool isLinearTransfer = (input_sizes[0] >= 1) && (input_sizes[1] == 1) &&
(input_sizes[2] == 1) && (input_sizes[3] == 1);
for (size_t i = 0; i < dims->size(); i++) {
// Pass down dimensions in reverse order; in the MLIR, this allows
// us to specify step sizes/wraps in the same order as we would
Expand All @@ -264,7 +266,7 @@ struct AIEDMATasksToNPUPass : AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {
input_strides, sizes, strides);
if (failed(verifyStridesWraps(bd_op, buffer_type, tile.getCol(),
tile.getRow(), input_sizes, input_strides,
sizes, strides))) {
sizes, strides, isLinearTransfer))) {
return failure();
}
// Ensure the total transfer length and the length expressed in the lowest
Expand Down
9 changes: 8 additions & 1 deletion programming_examples/basic/dma_transpose/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,14 @@ targetname = dmaTranspose
M ?= 64
K ?= 32

build/aie.mlir: ${srcdir}/aie2.py
aie_py_src=aie2.py
use_alt?=0

ifeq (${use_alt}, 1)
aie_py_src=aie2_alt.py
endif

build/aie.mlir: ${srcdir}/${aie_py_src}
mkdir -p ${@D}
python3 $< ${M} ${K} > $@

Expand Down
77 changes: 77 additions & 0 deletions programming_examples/basic/dma_transpose/aie2_alt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# dma_transpose/aie2_alt.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
import numpy as np
import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.extras.context import mlir_mod_ctx
from aie.helpers.dialects.ext.scf import _for as range_

# Default problem size: an M x K tensor holding N = M * K elements (64 * 64 = 4096).
N = 4096
M = 64
K = 64

# Optional command-line override: exactly two arguments, M then K
# (sys.argv[0] is the script name, hence the length check against 3).
if len(sys.argv) == 3:
M = int(sys.argv[1])
K = int(sys.argv[2])
N = M * K

# (M, K) int32 array type shared by the object fifos and the runtime sequence.
tensor_ty = np.ndarray[(M, K), np.dtype[np.int32]]


def my_passthrough():
with mlir_mod_ctx() as ctx:

@device(AIEDevice.npu1_1col)
def device_body():
# Tile declarations
ShimTile = tile(0, 0)
ComputeTile2 = tile(0, 2)

# AIE-array data movement with object fifos
of_in = object_fifo("in", ShimTile, ComputeTile2, 2, tensor_ty)
of_out = object_fifo("out", ComputeTile2, ShimTile, 2, tensor_ty)
object_fifo_link(of_in, of_out)

# Set up compute tiles

# Compute tile 2
@core(ComputeTile2)
def core_body():
for _ in range_(sys.maxsize):
pass

# To/from AIE-array data movement
@runtime_sequence(tensor_ty, tensor_ty, tensor_ty)
def sequence(A, B, C):
# The strides below are configured to read across all rows in the same column
# Stride of K in dim/wrap 2 skips an entire row to read a full column
in_task = dma_configure_task_for(of_in, issue_token=True)
with bds(in_task) as bd:
with bd[0]:
shim_dma_bd(
A,
sizes=[1, 1, K, M],
strides=[1, 1, 1, K],
)
EndOp()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you confirm if these EndOps are still necessary? I believe I've seen other places where the block terminators could be left off and that would clean this up, but it may not be possible here.

Copy link
Collaborator Author

@hunhoffe hunhoffe Nov 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can confirm through trying this in an example: EndOp is currently necessary.

Error without it is:

error: "-":21:9: block with no terminator, has "aie.dma_bd"(%arg0) <{dimensions = #aie<bd_dim_layout_array[<size = 1, stride = 1>, <size = 1, stride = 1>, <size = 32, stride = 1>, <size = 64, stride = 32>]>, len = 2048 : i32, offset = 0 : i32}> : (memref<64x32xi32>) -> ()
 note: "-":21:9: see current operation: "aie.dma_bd"(%arg0) <{dimensions = #aie<bd_dim_layout_array[<size = 1, stride = 1>, <size = 1, stride = 1>, <size = 32, stride = 1>, <size = 64, stride = 32>]>, len = 2048 : i32, offset = 0 : i32}> : (memref<64x32xi32>) -> ()

However, seen in the linked test, it seems like the EndOp is not necessary if you call next_bd but I haven't fully investigated: https://github.com/Xilinx/mlir-aie/blob/main/test/python/dma_tasks.py

I think that, with some additional time, I could clean this up. However, the Python bindings I'm working on autogenerate the dma_task code, so the user will never see this anyway. At the moment, I don't see the EndOp cleanup as an urgent task (but it should probably be done eventually).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, seen in the linked test, it seems like the EndOp is not necessary if you call next_bd

Because aie.next_bd is also a terminator.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

next_bd is also a terminator, so that makes sense. I agree that this isn't important; I was just curious.


out_task = dma_configure_task_for(of_out, issue_token=True)
with bds(out_task) as bd:
with bd[0]:
shim_dma_bd(C, sizes=[1, 1, 1, N])
EndOp()

dma_start_task(in_task, out_task)
dma_await_task(in_task, out_task)

print(ctx.module)


my_passthrough()
12 changes: 12 additions & 0 deletions programming_examples/basic/dma_transpose/run_makefile_alt.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, peano
//
// RUN: mkdir -p test_alt
// RUN: cd test_alt
// RUN: make -f %S/Makefile clean
// RUN: env use_alt=1 make -f %S/Makefile
// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
// CHECK: PASS!

Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ n_aie_cols?=4
kernels=mm_${m}x${k}x${n}
aieargs+=-m $m -k $k -n $n --n-aie-cols ${n_aie_cols}
target_suffix=${M}x${K}x${N}_${m}x${k}x${n}_${n_aie_cols}c
use_alt?=0

ifeq (${use_alt}, 1)
aie_py_src=aie2_alt.py
endif

include ${srcdir}/../makefile-common

Expand Down
Loading
Loading