diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index 2e1b2b1d32..307e91ace2 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td @@ -494,9 +494,9 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ let description = [{ An n-dimensional half DMA operator. - Programs a DMA on coordinates (`x`, `y`) to access a memory `memref` with an access - pattern specified by `offsets`, `sizes` and `strides` or `static_offsets`, `static_sizes` - and `static_strides`. The operator references the target channel through the `metadata` + Programs a DMA to access a memory `memref` with an access pattern specified by `offsets`, + `sizes` and `strides` or `static_offsets`, `static_sizes` and `static_strides`. The operator + references the target DMA coordinates (`x`, `y`) and channel through the `metadata` symbol and specifies a descriptor `id` to be used, which will become the `bd_id` to be used when lowered further. The `issue_token` attribute specifies whether the execution of this operation should issue a token which can be received and read for synchronization purposes. @@ -557,9 +557,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ }]; let arguments = ( - ins I64Attr:$x, - I64Attr:$y, - AnyMemRef:$memref, + ins AnyMemRef:$memref, // NOTE: these are in reverse order: offset3, offset2, ... Variadic:$offsets, Variadic:$sizes, @@ -580,7 +578,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ ); let assemblyFormat = [{ - `(` $x `,` $y `,` $memref `` + `(` $memref `` custom($offsets, $static_offsets) `` custom($sizes, $static_sizes) `` custom($strides, $static_strides) `` diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 45181b4a35..3225c539fb 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -370,10 +370,6 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() { llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) { return getConstantIntValue(s).value(); }); - llvm::SmallVector hardwareSizes(4); - llvm::SmallVector hardwareStrides(4); - getHardwareStridesWraps(targetModel, buffer, inputSizes, inputStrides, - hardwareSizes, hardwareStrides); int64_t offset = getOffsetInBytes(); // The experimental HSA target uses this op on AIE1, skip all the AIE2 @@ -385,19 +381,6 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() { return emitOpError("Offset must be 4-byte-aligned."); } - // dma_memcpy_nd transfers of the form [1, 1, 1, len][0, 0, 0, 1] do not - // specify any data layout transformation, but simply express a contiguous - // transfer of `len`. For backwards compatibility, we allow this to proceed - // even if it exceeds the maximum stride/wrap size of any one dimension, - // and simply do not lower any data layout transformations, since there is - // no other way to express this at the dma_memcpy_nd interface otherwise. - bool skipTransformationChecks = isLinearTransferWithoutTransformation(); - if (failed(verifyStridesWraps(*this, buffer, getX(), getY(), inputSizes, - inputStrides, hardwareSizes, hardwareStrides, - skipTransformationChecks))) { - return failure(); - } - // packet header if (auto packetInfo = getPacket()) { if (packetInfo->getPktType() > 7) diff --git a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp index c064ad6702..bb35c9d493 100644 --- a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp +++ b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp @@ -125,11 +125,11 @@ struct AIECtrlPacketToDmaPass : AIECtrlPacketToDmaBase { StringRef metadata = builder.getStringAttr(shimDmaAllocName); builder.create( - builder.getUnknownLoc(), 0, 0, newBlockArg, + builder.getUnknownLoc(), newBlockArg, SmallVector{}, SmallVector{}, SmallVector{}, - SmallVector{}, ArrayRef(staticOffsets), - ArrayRef(staticSizes), ArrayRef(staticStrides), - controllerIdPkt, metadata, 0, true, 0, 0, 0, 0, 0, 0); + ArrayRef(staticOffsets), ArrayRef(staticSizes), + ArrayRef(staticStrides), controllerIdPkt, metadata, 0, true, + 0, 0, 0, 0, 0, 0); auto shimRow = builder.getI32IntegerAttr(0); auto shimCol = builder.getI32IntegerAttr(col); diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index b18fd12ebe..270f359110 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -359,6 +359,19 @@ struct DmaToNpuPattern : OpConversionPattern { // row row = IntegerAttr::get(i32ty, 0); + // dma_memcpy_nd transfers of the form [1, 1, 1, len][0, 0, 0, 1] do not + // specify any data layout transformation, but simply express a contiguous + // transfer of `len`. For backwards compatibility, we allow this to proceed + // even if it exceeds the maximum stride/wrap size of any one dimension, + // and simply do not lower any data layout transformations, since there is + // no other way to express this at the dma_memcpy_nd interface otherwise. + bool skipTransformationChecks = op.isLinearTransferWithoutTransformation(); + if (failed(verifyStridesWraps(op, bufferType, col, 0, inputSizes, + inputStrides, sizes, strides, + skipTransformationChecks))) { + return failure(); + } + // arg_idx AIEX::RuntimeSequenceOp seq_op = op->getParentOfType(); diff --git a/python/dialects/aiex.py b/python/dialects/aiex.py index c0ab6fcfce..5fe927aa28 100644 --- a/python/dialects/aiex.py +++ b/python/dialects/aiex.py @@ -63,8 +63,6 @@ def __init__( strides: MixedValues | None = None, issue_token: bool | None = None, ): - x = 0 - y = 0 if tap and not (offsets is None and sizes is None and strides is None): raise ValueError( "NpuDmaMemcpyNd can take either a TileAccessPattern OR (sizes and/or strides and/or offsets), but not both." @@ -92,8 +90,6 @@ def __init__( if isinstance(metadata, ObjectFifoCreateOp): metadata = metadata.sym_name.value super().__init__( - x, - y, mem, dynamic_offsets, dynamic_sizes, diff --git a/test/Conversion/DmaToNpu/aiert_insts.mlir b/test/Conversion/DmaToNpu/aiert_insts.mlir index 5462a6abf2..47755828da 100644 --- a/test/Conversion/DmaToNpu/aiert_insts.mlir +++ b/test/Conversion/DmaToNpu/aiert_insts.mlir @@ -24,8 +24,8 @@ module { %c8 = arith.constant 8 : i64 %c16 = arith.constant 16 : i64 %c32 = arith.constant 32 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0, %c1]) { metadata = @of_toMem, id = 1 : i64, issue_token = true } : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8, %c1]) { metadata = @of_fromMem, id = 0 : i64, issue_token = false } : memref<4x2x8xi32> + aiex.npu.dma_memcpy_nd (%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0, %c1]) { metadata = @of_toMem, id = 1 : i64, issue_token = true } : memref<64xi32> + aiex.npu.dma_memcpy_nd (%in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8, %c1]) { metadata = @of_fromMem, id = 0 : i64, issue_token = false } : memref<4x2x8xi32> } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) aie.shim_dma_allocation @of_toMem (S2MM, 0, 0) diff --git a/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir b/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir index a6e6663f73..1c31a8e545 100644 --- a/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir +++ b/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir @@ -19,7 +19,7 @@ module @shimDmaMemcpy{ aie.device(xcve2302) { memref.global "public" @toMem : memref<1xbf16> aiex.runtime_sequence(%arg0: memref<1xbf16>, %arg1: memref<1xbf16>, %arg2: memref<1xbf16>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<1xbf16> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<1xbf16> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} } aie.shim_dma_allocation @toMem (S2MM, 0, 0) diff --git a/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir b/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir index fd99c22dd8..cefa229913 100644 --- a/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir +++ b/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir @@ -19,7 +19,7 @@ module @shimDmaMemcpy{ aie.device(xcve2302) { memref.global "public" @toMem : memref<65536xi64> aiex.runtime_sequence(%arg0: memref<65536xi64>, %arg1: memref<65536xi64>, %arg2: memref<65536xi64>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<65536xi64> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<65536xi64> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} } aie.shim_dma_allocation @toMem (S2MM, 0, 0) diff --git a/test/Conversion/DmaToNpu/dma_to_npu.mlir b/test/Conversion/DmaToNpu/dma_to_npu.mlir index 98503ea174..3f16346f9f 100644 --- a/test/Conversion/DmaToNpu/dma_to_npu.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu.mlir @@ -23,8 +23,8 @@ module { memref.global "public" @toMem : memref<16xi32> memref.global "public" @fromMem : memref<16xi32> aiex.runtime_sequence(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.npu.dma_memcpy_nd (0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd (%arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> } aie.shim_dma_allocation @fromMem (MM2S, 0, 0) aie.shim_dma_allocation @toMem (S2MM, 0, 0) @@ -49,7 +49,7 @@ module { aie.device(npu1_4col) { memref.global "public" @toMem : memref<16xi32> aiex.runtime_sequence(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { issue_token = true, metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { issue_token = true, metadata = @toMem, id = 1 : i64 } : memref<16xi32> aiex.npu.dma_wait {symbol = @toMem} } aie.shim_dma_allocation @toMem (S2MM, 0, 0) @@ -75,7 +75,7 @@ module { aie.device(npu1_4col) { memref.global "public" @toMem : memref<16xi32> aiex.runtime_sequence(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { issue_token = true, metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { issue_token = true, metadata = @toMem, id = 1 : i64 } : memref<16xi32> aiex.npu.dma_wait {symbol = @toMem} } aie.shim_dma_allocation @toMem (MM2S, 1, 1) @@ -139,7 +139,7 @@ module { aie.device(npu1_1col) { memref.global "public" @toMem : memref<16xi32> aiex.runtime_sequence(%arg0: memref<16xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1], packet = ) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1], packet = ) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> } aie.shim_dma_allocation @toMem (S2MM, 0, 0) } diff --git a/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir b/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir index 76f6a0ed88..e0cbb421b5 100644 --- a/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir @@ -26,8 +26,8 @@ module { memref.global "public" @toMem : memref<16xi32> memref.global "public" @fromMem : memref<16xi32> aiex.runtime_sequence(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32> - aiex.npu.dma_memcpy_nd (0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64, issue_token = false } : memref<16xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32> + aiex.npu.dma_memcpy_nd (%arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64, issue_token = false } : memref<16xi32> } aie.shim_dma_allocation @fromMem (MM2S, 0, 0) aie.shim_dma_allocation @toMem (S2MM, 0, 0) diff --git a/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir b/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir index 9c070eba02..9f5dc20e73 100644 --- a/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir @@ -30,7 +30,7 @@ module @shimDmaMemcpy{ aie.device(xcve2302) { memref.global "public" @toMem : memref<65536xbf16> aiex.runtime_sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) { - aiex.npu.dma_memcpy_nd (2, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<65536xbf16> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<65536xbf16> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} } aie.shim_dma_allocation @toMem (S2MM, 0, 2) diff --git a/test/Targets/AIETargetHSA/input_with_addresses.mlir b/test/Targets/AIETargetHSA/input_with_addresses.mlir index be4ece326b..2ed8f879c6 100644 --- a/test/Targets/AIETargetHSA/input_with_addresses.mlir +++ b/test/Targets/AIETargetHSA/input_with_addresses.mlir @@ -55,8 +55,8 @@ module { aie.shim_dma_allocation @out0(S2MM, 0, 6) aiex.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 0 : i64, metadata = @out0} : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 1 : i64, metadata = @in0} : memref<64xi32> + aiex.npu.dma_memcpy_nd (%arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 0 : i64, metadata = @out0} : memref<64xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 1 : i64, metadata = @in0} : memref<64xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} } } diff --git a/test/Targets/NPU/npu_dma_memcpy.mlir b/test/Targets/NPU/npu_dma_memcpy.mlir index 8d048d86c3..833fda29ea 100644 --- a/test/Targets/NPU/npu_dma_memcpy.mlir +++ b/test/Targets/NPU/npu_dma_memcpy.mlir @@ -24,7 +24,7 @@ module { aie.shim_dma_allocation @airMemcpyId12(MM2S, 0, 0) memref.global "public" @airMemcpyId12 : memref<1x2x1x32x32xi32, 1 : i32> aiex.runtime_sequence (%arg0: memref<2x64x64xi32>, %arg1: memref<2x64x64xi32>, %arg2: memref<2x64x64xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[1, 0, 0, 0][1, 2, 32, 32][4096, 2048, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId12} : memref<2x64x64xi32> + aiex.npu.dma_memcpy_nd(%arg0[1, 0, 0, 0][1, 2, 32, 32][4096, 2048, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId12} : memref<2x64x64xi32> } } } diff --git a/test/aiecc/buffers_xclbin.mlir b/test/aiecc/buffers_xclbin.mlir index 24e3bf1642..a4087a5d92 100644 --- a/test/aiecc/buffers_xclbin.mlir +++ b/test/aiecc/buffers_xclbin.mlir @@ -94,14 +94,14 @@ module { %12 = aie.tile(1, 2) %22 = aie.tile(2, 2) aiex.runtime_sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>, %arg3: memref<1024xi32>, %arg4: memref<1024xi32>, %arg5: memref<1024xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, metadata = @in0} : memref<1024xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 1 : i64, metadata = @out0} : memref<1024xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, metadata = @in0} : memref<1024xi32> + aiex.npu.dma_memcpy_nd (%arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 1 : i64, metadata = @out0} : memref<1024xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 2 : i64, metadata = @in1} : memref<1024xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg3[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 3 : i64, metadata = @out1} : memref<1024xi32> + aiex.npu.dma_memcpy_nd (%arg2[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 2 : i64, metadata = @in1} : memref<1024xi32> + aiex.npu.dma_memcpy_nd (%arg3[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 3 : i64, metadata = @out1} : memref<1024xi32> aiex.npu.sync {channel = 1 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd (0, 0, %arg4[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 2 : i64, metadata = @in2} : memref<1024xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg5[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 3 : i64, metadata = @out2} : memref<1024xi32> + aiex.npu.dma_memcpy_nd (%arg4[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 2 : i64, metadata = @in2} : memref<1024xi32> + aiex.npu.dma_memcpy_nd (%arg5[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 3 : i64, metadata = @out2} : memref<1024xi32> aiex.npu.sync {channel = 0 : i32, column = 2 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} } } diff --git a/test/dialect/AIEX/bad_npu_nd.mlir b/test/dialect/AIEX/bad_npu_nd.mlir index 5a29df773c..fb0ee285c3 100644 --- a/test/dialect/AIEX/bad_npu_nd.mlir +++ b/test/dialect/AIEX/bad_npu_nd.mlir @@ -19,7 +19,7 @@ module { %c1920 = arith.constant 1920 : i64 %c1080 = arith.constant 1080 : i64 // expected-error@+1 {{Size 0 exceeds the [0:1023] range}} - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi32> + aiex.npu.dma_memcpy_nd (%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi32> } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } @@ -39,7 +39,7 @@ module { %c32 = arith.constant 32 : i64 %c128 = arith.constant 128 : i64 // expected-error@+1 {{Size 3 exceeds the [1:64] range}} - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c128,%c2,%c2,%c8][%c0,%c16,%c8,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<128x4x2x8xi32> + aiex.npu.dma_memcpy_nd (%in[%c0,%c0,%c0,%c0][%c128,%c2,%c2,%c8][%c0,%c16,%c8,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<128x4x2x8xi32> } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } @@ -55,7 +55,7 @@ module { %c2 = arith.constant 2 : i64 %c2097152 = arith.constant 2097152 : i64 // expected-error@+1 {{Stride 1 exceeds the [1:1048576] range}} - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2][%c0,%c0,%c2097152,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<8388608xi32> + aiex.npu.dma_memcpy_nd (%in[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2][%c0,%c0,%c2097152,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<8388608xi32> } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } @@ -73,7 +73,7 @@ module { %c2 = arith.constant 2 : i64 %c8 = arith.constant 8 : i64 // expected-error@+1 {{Offset must be 4-byte-aligned}} - aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c1][%c1,%c1,%c1,%c8][%c0,%c0,%c1,%c1]) { metadata = @fifo, id = 0 : i64 } : memref<8xi8> + aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c1][%c1,%c1,%c1,%c8][%c0,%c0,%c1,%c1]) { metadata = @fifo, id = 0 : i64 } : memref<8xi8> } aie.shim_dma_allocation @fifo (MM2S, 0, 0) } @@ -95,7 +95,7 @@ module { %c2048 = arith.constant 2048 : i64 // Although 2048 exceeds the 0:1023 limit for size 0, since the elements are i8s, // this should be a size of 512 in address granularity (4 bytes) and hence pass the test. - aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> + aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -113,7 +113,7 @@ module { %c8 = arith.constant 8 : i64 %c2048 = arith.constant 2048 : i64 // expected-error@+1 {{Size 0 exceeds the [0:1023] range}} - aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16> + aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16> } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -132,7 +132,7 @@ module { %c2 = arith.constant 2 : i64 // Stride of 2 i8s = 2 bytes < 4 byte granularity, should not be possible %c8 = arith.constant 8 : i64 // expected-error@+1 {{Stride 1 is 2 elements * 1 bytes = 2 bytes, which is not divisible by 4}} - aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c2,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> + aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c2,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -149,7 +149,7 @@ module { %c4 = arith.constant 4 : i64 %c8 = arith.constant 8 : i64 // expected-error@+1 {{2 elements at 1 bytes each equal 2 bytes, which is not divisible by 4}} - aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> + aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -168,7 +168,7 @@ module { %c4 = arith.constant 4 : i64 %c8 = arith.constant 8 : i64 // expected-error@+1 {{Stride 0 is 2 elements * 1 bytes = 2 bytes, which is not divisible by 4}} - aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c0,%c2]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> + aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c0,%c2]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -186,7 +186,7 @@ module { %c3 = arith.constant 3 : i64 %c8 = arith.constant 8 : i64 // expected-error@+1 {{3 elements at 2 bytes each equal 6 bytes, which is not divisible by 4}} - aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c3][%c0,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16> + aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c3][%c0,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16> } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -204,7 +204,7 @@ module { %c4 = arith.constant 4 : i64 %c8 = arith.constant 8 : i64 // expected-error@+1 {{Unsupported tile type at (0, 0) Must be ShimNOC, Mem or Core.}} - aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c4][%c0,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16> + aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c4][%c0,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16> } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -223,9 +223,9 @@ module { %c3 = arith.constant 3 : i64 %c8 = arith.constant 8 : i64 %c1572864 = arith.constant 1572864 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c1572864,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi32> + aiex.npu.dma_memcpy_nd (%a[%c1,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c1572864,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi32> // expected-error@+1 {{Stride 3 exceeds the [1:1048576] range.}} - aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c1572864,%c0,%c0,%c1]) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32> + aiex.npu.dma_memcpy_nd (%a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c1572864,%c0,%c0,%c1]) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32> } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -243,9 +243,9 @@ module { %c2 = arith.constant 2 : i64 %c3 = arith.constant 3 : i64 %c8 = arith.constant 8 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = ) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi32> + aiex.npu.dma_memcpy_nd (%a[%c1,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = ) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi32> // expected-error@+1 {{Packet ID field can only hold 5 bits.}} - aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = ) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32> + aiex.npu.dma_memcpy_nd (%a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = ) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32> } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -264,7 +264,7 @@ module { %c3 = arith.constant 3 : i64 %c8 = arith.constant 8 : i64 // expected-error@+1 {{Packet type field can only hold 3 bits.}} - aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = ) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32> + aiex.npu.dma_memcpy_nd (%a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = ) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32> } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } diff --git a/test/dialect/AIEX/ctrl_pkt_to_dma.mlir b/test/dialect/AIEX/ctrl_pkt_to_dma.mlir index d8e8a4c44f..7ceac8d607 100644 --- a/test/dialect/AIEX/ctrl_pkt_to_dma.mlir +++ b/test/dialect/AIEX/ctrl_pkt_to_dma.mlir @@ -14,7 +14,7 @@ // CHECK-LABEL: aie.device(npu1_1col) { // CHECK: aiex.runtime_sequence(%[[ARG0:.*]]: memref) { -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref +// CHECK: aiex.npu.dma_memcpy_nd(%[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref // CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} aie.device(npu1_1col) { @@ -30,7 +30,7 @@ aie.device(npu1_1col) { // CHECK-LABEL: aie.device(npu1_1col) { // CHECK: aiex.runtime_sequence(%[[ARG0:.*]]: memref) { -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref +// CHECK: aiex.npu.dma_memcpy_nd(%[[ARG0]][0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt_col0_mm2s_chan0} : memref // CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} aie.device(npu1_1col) { diff --git a/test/lower-to-standard/aiex_standard_lowering.mlir b/test/lower-to-standard/aiex_standard_lowering.mlir index 4545259d3e..433f925e1d 100644 --- a/test/lower-to-standard/aiex_standard_lowering.mlir +++ b/test/lower-to-standard/aiex_standard_lowering.mlir @@ -16,7 +16,7 @@ module { aie.device(npu1_4col) { memref.global "public" @toMem : memref<16xi32> aiex.runtime_sequence(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> aiex.npu.dma_wait {symbol = @toMem} } aie.shim_dma_allocation @toMem (MM2S, 1, 1) diff --git a/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/aie.mlir b/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/aie.mlir index f4743e3ced..4987e4b123 100644 --- a/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/aie.mlir +++ b/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/aie.mlir @@ -62,8 +62,8 @@ module { %c56_i64 = arith.constant 56 : i64 %c61_i64 = arith.constant 61 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c61_i64, %c56_i64][%c0_i64, %c0_i64, %c56_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c61_i64, %c56_i64][%c0_i64, %c0_i64, %c56_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> aiex.npu.dma_wait { symbol = @objFifo_out0 } } diff --git a/test/npu-xrt/add_21_i8_using_dma_op_with_padding/aie.mlir b/test/npu-xrt/add_21_i8_using_dma_op_with_padding/aie.mlir index 48af170aa8..ff70c8738e 100644 --- a/test/npu-xrt/add_21_i8_using_dma_op_with_padding/aie.mlir +++ b/test/npu-xrt/add_21_i8_using_dma_op_with_padding/aie.mlir @@ -69,8 +69,8 @@ module { %c1_i64 = arith.constant 1 : i64 %c32_i64 = arith.constant 32 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c32_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c32_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi8> aiex.npu.dma_wait { symbol = @objFifo_out0 } } diff --git a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir index 1fc7df0961..a3571c3645 100644 --- a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir +++ b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir @@ -103,8 +103,8 @@ module { aie.shim_dma_allocation @data_in(MM2S, 0, 0) aie.shim_dma_allocation @data_out(S2MM, 0, 0) aiex.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 0 : i64, metadata = @data_in} : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 1 : i64, metadata = @data_out, issue_token = true} : memref<64xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 0 : i64, metadata = @data_in} : memref<64xi32> + aiex.npu.dma_memcpy_nd (%arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 1 : i64, metadata = @data_out, issue_token = true} : memref<64xi32> aiex.npu.dma_wait {symbol = @data_out} } } diff --git a/test/npu-xrt/add_314_using_dma_op/aie.mlir b/test/npu-xrt/add_314_using_dma_op/aie.mlir index 544f4fc837..19aece87c2 100644 --- a/test/npu-xrt/add_314_using_dma_op/aie.mlir +++ b/test/npu-xrt/add_314_using_dma_op/aie.mlir @@ -68,8 +68,8 @@ module { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32> aiex.npu.dma_wait { symbol = @objFifo_out0 } } diff --git a/test/npu-xrt/add_378_i32_using_dma_op_with_padding/aie.mlir b/test/npu-xrt/add_378_i32_using_dma_op_with_padding/aie.mlir index 0c058cd4d1..12957ebc55 100644 --- a/test/npu-xrt/add_378_i32_using_dma_op_with_padding/aie.mlir +++ b/test/npu-xrt/add_378_i32_using_dma_op_with_padding/aie.mlir @@ -69,8 +69,8 @@ module { %c1_i64 = arith.constant 1 : i64 %c52_i64 = arith.constant 52 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c52_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c52_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32> aiex.npu.dma_wait {symbol = @objFifo_out0} } diff --git a/test/npu-xrt/add_blockwrite/aie.mlir b/test/npu-xrt/add_blockwrite/aie.mlir index 63d8a0373c..aba3d56565 100644 --- a/test/npu-xrt/add_blockwrite/aie.mlir +++ b/test/npu-xrt/add_blockwrite/aie.mlir @@ -78,8 +78,8 @@ module { %0 = memref.get_global @myData : memref<8xi32> aiex.npu.blockwrite(%0) {buffer = @constant_buffer, address = 0 : ui32} : memref<8xi32> aiex.npu.write32 {buffer = @constant_buffer, address = 4 : ui32, value = 42 : ui32} - aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, issue_token = true, metadata = @objFifo_in0} : memref<64xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, issue_token = true, metadata = @objFifo_out0} : memref<64xi32> + aiex.npu.dma_memcpy_nd(%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, issue_token = true, metadata = @objFifo_in0} : memref<64xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, issue_token = true, metadata = @objFifo_out0} : memref<64xi32> aiex.npu.dma_wait {symbol = @objFifo_in0} aiex.npu.dma_wait {symbol = @objFifo_out0} } diff --git a/test/npu-xrt/add_maskwrite/aie.mlir b/test/npu-xrt/add_maskwrite/aie.mlir index 2562779a42..831697ad2b 100644 --- a/test/npu-xrt/add_maskwrite/aie.mlir +++ b/test/npu-xrt/add_maskwrite/aie.mlir @@ -71,7 +71,7 @@ module { aiex.npu.maskwrite32 {row = 2 : i32, column = 0 : i32, address = 1024 : ui32, value = 0x12345678 : ui32, mask = 0xF0F0F0F0 : ui32} aiex.npu.maskwrite32 {buffer = @input_buffer, address = 1 : ui32, value = 0x9ABCDEF0 : ui32, mask = 0x0F0F0F0F : ui32} - aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<8xi32> aiex.npu.write32 { row = 2 : i32, column = 0 : i32, address = 0x0001F000 : ui32, value = 1 : ui32 } aiex.npu.dma_wait {symbol = @out0} } diff --git a/test/npu-xrt/add_one_ctrl_packet/aie.mlir b/test/npu-xrt/add_one_ctrl_packet/aie.mlir index 20e55c0f5b..80396000e8 100644 --- a/test/npu-xrt/add_one_ctrl_packet/aie.mlir +++ b/test/npu-xrt/add_one_ctrl_packet/aie.mlir @@ -121,8 +121,8 @@ module { // aiex.npu.maskwrite32 {address = 0x00060000 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32, mask = 0x8 : ui32} // start reading output - aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, issue_token = true, metadata = @ctrl0} : memref<8xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 2 : i64, issue_token = true, metadata = @out0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, issue_token = true, metadata = @ctrl0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 2 : i64, issue_token = true, metadata = @out0} : memref<8xi32> // write bd0 %0 = memref.get_global @blockwrite_data_0 : memref<8xi32> diff --git a/test/npu-xrt/add_one_ctrl_packet_4_cores/aie.mlir b/test/npu-xrt/add_one_ctrl_packet_4_cores/aie.mlir index e96d24025e..effc8da770 100644 --- a/test/npu-xrt/add_one_ctrl_packet_4_cores/aie.mlir +++ b/test/npu-xrt/add_one_ctrl_packet_4_cores/aie.mlir @@ -358,24 +358,24 @@ module { %c24_i64 = arith.constant 24 : i64 // start reading output - aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, issue_token = true, metadata = @ctrl0} : memref<8xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 2 : i64, issue_token = true, metadata = @out0} : memref<32xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c8_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 3 : i64, issue_token = true, metadata = @out1} : memref<32xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c16_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 4 : i64, issue_token = true, metadata = @out2} : memref<32xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c24_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 5 : i64, issue_token = true, metadata = @out3} : memref<32xi32> + aiex.npu.dma_memcpy_nd(%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, issue_token = true, metadata = @ctrl0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 2 : i64, issue_token = true, metadata = @out0} : memref<32xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c8_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 3 : i64, issue_token = true, metadata = @out1} : memref<32xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c16_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 4 : i64, issue_token = true, metadata = @out2} : memref<32xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c24_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 5 : i64, issue_token = true, metadata = @out3} : memref<32xi32> // write bd0 - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 6 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c4_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 7 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c8_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 8 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c12_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 9 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 6 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c4_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 7 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c8_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 8 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c12_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 9 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} // patch bd0 address for packet 1, push to mm2s_0_task_queue, wait - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c2_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 6 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c6_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 7 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c10_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 8 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c14_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 9 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c2_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 6 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c6_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 7 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c10_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 8 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c14_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 9 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} // wait for dma output diff --git a/test/npu-xrt/add_one_ctrl_packet_col_overlay/aie.mlir b/test/npu-xrt/add_one_ctrl_packet_col_overlay/aie.mlir index 0bb576a7f4..84b2fb691e 100644 --- a/test/npu-xrt/add_one_ctrl_packet_col_overlay/aie.mlir +++ b/test/npu-xrt/add_one_ctrl_packet_col_overlay/aie.mlir @@ -342,29 +342,29 @@ module { %c24_i64 = arith.constant 24 : i64 // start reading output - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 2 : i64, issue_token = true, metadata = @out0} : memref<32xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c8_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 3 : i64, issue_token = true, metadata = @out1} : memref<32xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c16_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 4 : i64, issue_token = true, metadata = @out2} : memref<32xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c24_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 5 : i64, issue_token = true, metadata = @out3} : memref<32xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 2 : i64, issue_token = true, metadata = @out0} : memref<32xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c8_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 3 : i64, issue_token = true, metadata = @out1} : memref<32xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c16_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 4 : i64, issue_token = true, metadata = @out2} : memref<32xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c24_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 5 : i64, issue_token = true, metadata = @out3} : memref<32xi32> // write bd0 - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 6 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 6 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c4_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 7 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c4_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 7 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> aiex.npu.sync {channel = 1 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c8_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 8 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c8_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 8 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> aiex.npu.sync {channel = 1 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c12_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 9 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c12_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 9 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> aiex.npu.sync {channel = 1 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} // patch bd0 address for packet 1, push to mm2s_0_task_queue, wait - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c2_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 6 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c2_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 6 : i64, issue_token = true, metadata = @ctrlin0} : memref<8xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c6_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 7 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c6_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 7 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> aiex.npu.sync {channel = 1 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c10_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 8 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c10_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 8 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> aiex.npu.sync {channel = 1 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c14_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 9 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> + aiex.npu.dma_memcpy_nd(%arg1[%c0_i64, %c0_i64, %c0_i64, %c14_i64] [%c1_i64, %c1_i64, %c1_i64, %c2_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64], packet = ) {id = 9 : i64, issue_token = true, metadata = @ctrlin1} : memref<8xi32> aiex.npu.sync {channel = 1 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} // wait for dma output diff --git a/test/npu-xrt/add_one_objFifo/aie.mlir b/test/npu-xrt/add_one_objFifo/aie.mlir index b7ab02a0bc..e0b31a5b9a 100644 --- a/test/npu-xrt/add_one_objFifo/aie.mlir +++ b/test/npu-xrt/add_one_objFifo/aie.mlir @@ -47,8 +47,8 @@ module { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64, issue_token = true } : memref<64xi32> + aiex.npu.dma_memcpy_nd (%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64, issue_token = true } : memref<64xi32> aiex.npu.dma_wait { symbol = @objFifo_out0 } } } diff --git a/test/npu-xrt/add_one_two/aie1.mlir b/test/npu-xrt/add_one_two/aie1.mlir index 95eef13574..f39bd5bf02 100644 --- a/test/npu-xrt/add_one_two/aie1.mlir +++ b/test/npu-xrt/add_one_two/aie1.mlir @@ -44,8 +44,8 @@ module { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<64xi32> + aiex.npu.dma_memcpy_nd (%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> aiex.npu.dma_wait { symbol = @objFifo_out0 } } } diff --git a/test/npu-xrt/add_one_two_txn/aie1.mlir b/test/npu-xrt/add_one_two_txn/aie1.mlir index 1a2492bd8b..6ff646e2e9 100644 --- a/test/npu-xrt/add_one_two_txn/aie1.mlir +++ b/test/npu-xrt/add_one_two_txn/aie1.mlir @@ -47,8 +47,8 @@ module { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<64xi32> + aiex.npu.dma_memcpy_nd (%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> aiex.npu.dma_wait { symbol = @objFifo_out0 } } } diff --git a/test/npu-xrt/add_one_using_dma/aie.mlir b/test/npu-xrt/add_one_using_dma/aie.mlir index 28c3e8b49c..41f5d96e7f 100644 --- a/test/npu-xrt/add_one_using_dma/aie.mlir +++ b/test/npu-xrt/add_one_using_dma/aie.mlir @@ -79,8 +79,8 @@ module { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32> + aiex.npu.dma_memcpy_nd(%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32> aiex.npu.dma_wait {symbol = @objFifo_out0} } diff --git a/test/npu-xrt/cascade_flows/aie.mlir b/test/npu-xrt/cascade_flows/aie.mlir index fc1216421e..f276de84c8 100644 --- a/test/npu-xrt/cascade_flows/aie.mlir +++ b/test/npu-xrt/cascade_flows/aie.mlir @@ -63,8 +63,8 @@ module { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<64xi32> + aiex.npu.dma_memcpy_nd (%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> aiex.npu.dma_wait { symbol = @objFifo_out0 } } } diff --git a/test/npu-xrt/column_specific/aie2.py b/test/npu-xrt/column_specific/aie2.py new file mode 100644 index 0000000000..885973431e --- /dev/null +++ b/test/npu-xrt/column_specific/aie2.py @@ -0,0 +1,58 @@ +# column_specific/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# +# REQUIRES: ryzen_ai, valid_xchess_license +# +# RUN: %python %S/aie2.py > ./aie2.mlir +# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt + +import numpy as np +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.context import mlir_mod_ctx + +N = 4096 +dev = AIEDevice.npu1_3col +col = 2 +line_size = 1024 + + +def my_passthrough(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + vector_ty = np.ndarray[(N,), np.dtype[np.int32]] + line_ty = np.ndarray[(line_size,), np.dtype[np.int32]] + + # Tile declarations + ShimTile = tile(col, 0) + ComputeTile2 = tile(col, 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile2, 2, line_ty) + of_out = object_fifo("out", ComputeTile2, ShimTile, 2, line_ty) + object_fifo_link(of_in, of_out) + + # To/from AIE-array data movement + @runtime_sequence(vector_ty, vector_ty, vector_ty) + def sequence(A, B, C): + npu_dma_memcpy_nd( + metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True + ) + npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N]) + dma_wait(of_in, of_out) + + print(ctx.module) + + +my_passthrough() diff --git a/test/npu-xrt/column_specific/test.cpp b/test/npu-xrt/column_specific/test.cpp new file mode 100644 index 0000000000..3e227310cf --- /dev/null +++ b/test/npu-xrt/column_specific/test.cpp @@ -0,0 +1,195 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +namespace po = boost::program_options; + +void check_arg_file_exists(po::variables_map &vm_in, std::string name) { + if (!vm_in.count(name)) { + throw std::runtime_error("Error: no " + name + " file was provided\n"); + } else { + std::ifstream test(vm_in[name].as()); + if (!test) { + throw std::runtime_error("The " + name + " file " + + vm_in[name].as() + + " does not exist.\n"); + } + } +} + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +int main(int argc, const char *argv[]) { + // Program arguments parsing + po::options_description desc("Allowed options"); + desc.add_options()("help,h", "produce help message")( + "xclbin,x", po::value()->required(), + "the input xclbin path")( + "kernel,k", po::value()->required(), + "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( + "verbosity,v", po::value()->default_value(0), + "the verbosity of the output")( + "instr,i", po::value()->required(), + "path of file containing userspace instructions to be sent to the LX6")( + "length,l", po::value()->default_value(4096), + "the length of the transfer in int32_t"); + po::variables_map vm; + + try { + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << std::endl; + return 1; + } + } catch (const std::exception &ex) { + std::cerr << ex.what() << "\n\n"; + std::cerr << "Usage:\n" << desc << std::endl; + return 1; + } + + check_arg_file_exists(vm, "xclbin"); + check_arg_file_exists(vm, "instr"); + + std::vector instr_v = + load_instr_sequence(vm["instr"].as()); + + int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << std::endl; + + int N = vm["length"].as(); + if ((N % 1024)) { + std::cerr << "Length must be a multiple of 1024." << std::endl; + return 1; + } + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() + << std::endl; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() + << std::endl; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + + device.register_xclbin(xclbin); + + // get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context." << std::endl; + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << std::endl; + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_inA = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(3)); + auto bo_inB = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(4)); + auto bo_out = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(5)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects." << std::endl; + + int32_t *bufInA = bo_inA.map(); + std::vector srcVecA; + for (int i = 0; i < N; i++) + srcVecA.push_back(i + 1); + memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t))); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + if (verbosity >= 1) + std::cout << "Running Kernel." << std::endl; + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + run.wait(); + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + uint32_t *bufOut = bo_out.map(); + + int errors = 0; + + for (uint32_t i = 0; i < N; i++) { + uint32_t ref = (i + 1); + if (*(bufOut + i) != ref) { + errors++; + } + } + + if (!errors) { + std::cout << std::endl << "PASS!" << std::endl << std::endl; + return 0; + } else { + std::cout << std::endl + << errors << " mismatches." << std::endl + << std::endl; + std::cout << std::endl << "fail." << std::endl << std::endl; + return 1; + } +} \ No newline at end of file diff --git a/test/npu-xrt/ctrl_packet_reconfig/aie2.mlir b/test/npu-xrt/ctrl_packet_reconfig/aie2.mlir index 50ea11afae..06a1618018 100644 --- a/test/npu-xrt/ctrl_packet_reconfig/aie2.mlir +++ b/test/npu-xrt/ctrl_packet_reconfig/aie2.mlir @@ -73,8 +73,8 @@ module { %c56_i64 = arith.constant 56 : i64 %c61_i64 = arith.constant 61 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @objFifo_in0} : memref<64x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @objFifo_in0} : memref<64x64xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> aiex.npu.dma_wait { symbol = @objFifo_out0 } } diff --git a/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/aie2.mlir b/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/aie2.mlir index 483dd9f8bb..651be72f45 100644 --- a/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/aie2.mlir +++ b/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/aie2.mlir @@ -290,8 +290,8 @@ module { %c4_i64 = arith.constant 4 : i64 %c4096_i64 = arith.constant 4096 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c4_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @objFifo_in0} : memref<4x64x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c4_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<4x64x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c4_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @objFifo_in0} : memref<4x64x64xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c4_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<4x64x64xi8> aiex.npu.dma_wait { symbol = @objFifo_out0 } } } diff --git a/test/npu-xrt/ctrl_packet_reconfig_4x1_cores/aie2.mlir b/test/npu-xrt/ctrl_packet_reconfig_4x1_cores/aie2.mlir index 374b846842..543ee3d1c6 100644 --- a/test/npu-xrt/ctrl_packet_reconfig_4x1_cores/aie2.mlir +++ b/test/npu-xrt/ctrl_packet_reconfig_4x1_cores/aie2.mlir @@ -255,15 +255,15 @@ module { %c4_i64 = arith.constant 4 : i64 %c4096_i64 = arith.constant 4096 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @shim_in_0} : memref<4x64x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c1_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64], packet = ) {id = 1 : i64, metadata = @shim_in_1} : memref<4x64x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c2_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64], packet = ) {id = 2 : i64, metadata = @shim_in_2} : memref<4x64x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c3_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64], packet = ) {id = 3 : i64, metadata = @shim_in_3} : memref<4x64x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @shim_in_0} : memref<4x64x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c1_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64], packet = ) {id = 1 : i64, metadata = @shim_in_1} : memref<4x64x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c2_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64], packet = ) {id = 2 : i64, metadata = @shim_in_2} : memref<4x64x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c3_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64], packet = ) {id = 3 : i64, metadata = @shim_in_3} : memref<4x64x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64]) {id = 4 : i64, metadata = @shim_out_0, issue_token = true} : memref<4x64x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c1_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64]) {id = 5 : i64, metadata = @shim_out_1, issue_token = true} : memref<4x64x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c2_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64]) {id = 6 : i64, metadata = @shim_out_2, issue_token = true} : memref<4x64x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c3_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64]) {id = 7 : i64, metadata = @shim_out_3, issue_token = true} : memref<4x64x64xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64]) {id = 4 : i64, metadata = @shim_out_0, issue_token = true} : memref<4x64x64xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c1_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64]) {id = 5 : i64, metadata = @shim_out_1, issue_token = true} : memref<4x64x64xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c2_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64]) {id = 6 : i64, metadata = @shim_out_2, issue_token = true} : memref<4x64x64xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c3_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64]) {id = 7 : i64, metadata = @shim_out_3, issue_token = true} : memref<4x64x64xi8> aiex.npu.dma_wait { symbol = @shim_out_0 } aiex.npu.dma_wait { symbol = @shim_out_1 } aiex.npu.dma_wait { symbol = @shim_out_2 } diff --git a/test/npu-xrt/dmabd_task_queue/aie.mlir b/test/npu-xrt/dmabd_task_queue/aie.mlir index 229be2561e..a02d71e2a5 100644 --- a/test/npu-xrt/dmabd_task_queue/aie.mlir +++ b/test/npu-xrt/dmabd_task_queue/aie.mlir @@ -184,10 +184,10 @@ module { aie.shim_dma_allocation @airMemcpyId5(MM2S, 0, 1) memref.global "public" @airMemcpyId5 : memref<1x48xbf16, 1 : i32> aiex.runtime_sequence @six(%arg0: memref<5xi32>, %arg1: memref<96xi32>, %arg2: memref<96xi32>, %arg3: memref<9xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 5][0, 0, 0, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<5xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 96][0, 0, 0, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<96xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 96][0, 0, 0, 1]) {id = 0 : i64, metadata = @airMemcpyId5} : memref<96xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 0][1, 1, 1, 9][0, 0, 0, 1]) {id = 0 : i64, metadata = @airMemcpyId12} : memref<9xi32> + aiex.npu.dma_memcpy_nd(%arg0[0, 0, 0, 0][1, 1, 1, 5][0, 0, 0, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<5xi32> + aiex.npu.dma_memcpy_nd(%arg1[0, 0, 0, 0][1, 1, 1, 96][0, 0, 0, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<96xi32> + aiex.npu.dma_memcpy_nd(%arg2[0, 0, 0, 0][1, 1, 1, 96][0, 0, 0, 1]) {id = 0 : i64, metadata = @airMemcpyId5} : memref<96xi32> + aiex.npu.dma_memcpy_nd(%arg3[0, 0, 0, 0][1, 1, 1, 9][0, 0, 0, 1]) {id = 0 : i64, metadata = @airMemcpyId12} : memref<9xi32> aiex.npu.sync {channel = 0 : i32, column = 2 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} } } {sym_name = "segment_0"} diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir index 87197925b1..4f32753ae1 100644 --- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir +++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir @@ -160,8 +160,8 @@ module { } {link_with = "kernel.o"} aie.shim_dma_allocation @input_fifo(MM2S, 0, 0) aiex.runtime_sequence(%arg0: memref<10xi32>, %arg1: memref<10xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 0 : i64, metadata = @input_fifo} : memref<10xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 2 : i64, metadata = @output_fifo} : memref<10xi32> + aiex.npu.dma_memcpy_nd(%arg0[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 0 : i64, metadata = @input_fifo} : memref<10xi32> + aiex.npu.dma_memcpy_nd(%arg1[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 2 : i64, metadata = @output_fifo} : memref<10xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} } aie.shim_dma_allocation @output_fifo(S2MM, 0, 0) diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir index a9a28b94f2..f3d4922eaf 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir @@ -638,9 +638,9 @@ module { memref.assume_alignment %arg0, 64 : memref<16x16xi32> memref.assume_alignment %arg1, 64 : memref<16x16xi32> memref.assume_alignment %arg2, 64 : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12, issue_token = true} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12, issue_token = true} : memref<16x16xi32> aiex.npu.dma_wait {symbol = @airMemcpyId12} } } {sym_name = "segment_0"} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir index c21ee5b652..55209abdc1 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir @@ -486,9 +486,9 @@ module { memref.assume_alignment %arg0, 64 : memref<16x16xi32> memref.assume_alignment %arg1, 64 : memref<16x16xi32> memref.assume_alignment %arg2, 64 : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12, issue_token = true} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12, issue_token = true} : memref<16x16xi32> aiex.npu.dma_wait {symbol = @airMemcpyId12} } } {sym_name = "segment_0"} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir index 0c8e533e80..412c411705 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir @@ -191,9 +191,9 @@ module { memref.assume_alignment %arg0, 64 : memref<16x16xi32> memref.assume_alignment %arg1, 64 : memref<16x16xi32> memref.assume_alignment %arg2, 64 : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12, issue_token = true} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12, issue_token = true} : memref<16x16xi32> aiex.npu.dma_wait { symbol = @airMemcpyId12} } } {sym_name = "segment_0"} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir index 005c72ec30..b36fd1e98b 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir @@ -522,9 +522,9 @@ module { memref.assume_alignment %arg0, 64 : memref<16x16xi32> memref.assume_alignment %arg1, 64 : memref<16x16xi32> memref.assume_alignment %arg2, 64 : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12, issue_token = true} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> + aiex.npu.dma_memcpy_nd (%arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12, issue_token = true} : memref<16x16xi32> aiex.npu.dma_wait {symbol = @airMemcpyId12} } } {sym_name = "segment_0"} diff --git a/test/npu-xrt/packet_flow/aie.mlir b/test/npu-xrt/packet_flow/aie.mlir index 45b804499c..f414cf6faa 100644 --- a/test/npu-xrt/packet_flow/aie.mlir +++ b/test/npu-xrt/packet_flow/aie.mlir @@ -74,8 +74,8 @@ module { %c56_i64 = arith.constant 56 : i64 %c61_i64 = arith.constant 61 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @objFifo_in0} : memref<64x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @objFifo_in0} : memref<64x64xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> aiex.npu.dma_wait { symbol = @objFifo_out0 } } diff --git a/test/npu-xrt/packet_flow_fanin/aie.mlir b/test/npu-xrt/packet_flow_fanin/aie.mlir index 74fa08d229..f8ded10137 100644 --- a/test/npu-xrt/packet_flow_fanin/aie.mlir +++ b/test/npu-xrt/packet_flow_fanin/aie.mlir @@ -97,9 +97,9 @@ module { %c1_i64 = arith.constant 1 : i64 %c64_i64 = arith.constant 64 : i64 %c128_i64 = arith.constant 128 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @objFifo_in0} : memref<128x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c64_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 1 : i64, metadata = @objFifo_in1} : memref<128x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c128_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 2 : i64, metadata = @objFifo_out0, issue_token = true} : memref<128x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @objFifo_in0} : memref<128x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c64_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 1 : i64, metadata = @objFifo_in1} : memref<128x64xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c128_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 2 : i64, metadata = @objFifo_out0, issue_token = true} : memref<128x64xi8> aiex.npu.dma_wait { symbol = @objFifo_out0 } } diff --git a/test/npu-xrt/packet_flow_fanout/aie.mlir b/test/npu-xrt/packet_flow_fanout/aie.mlir index 33aa38580e..3d90acbef0 100644 --- a/test/npu-xrt/packet_flow_fanout/aie.mlir +++ b/test/npu-xrt/packet_flow_fanout/aie.mlir @@ -123,10 +123,10 @@ module { %c1_i64 = arith.constant 1 : i64 %c64_i64 = arith.constant 64 : i64 // Packet-flow fanout happening at shim dma channel @objFifo_in0, where packet id 3 and 7 go to tile_0_1's S2MM DMA channel 0 and 2, respectively - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @objFifo_in0} : memref<128x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c64_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 1 : i64, metadata = @objFifo_in0} : memref<128x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 2 : i64, metadata = @objFifo_out0, issue_token = true} : memref<128x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c64_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 3 : i64, metadata = @objFifo_out1, issue_token = true} : memref<128x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @objFifo_in0} : memref<128x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[%c0_i64, %c0_i64, %c64_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 1 : i64, metadata = @objFifo_in0} : memref<128x64xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 2 : i64, metadata = @objFifo_out0, issue_token = true} : memref<128x64xi8> + aiex.npu.dma_memcpy_nd (%arg2[%c0_i64, %c0_i64, %c64_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 3 : i64, metadata = @objFifo_out1, issue_token = true} : memref<128x64xi8> aiex.npu.dma_wait { symbol = @objFifo_out0 } aiex.npu.dma_wait { symbol = @objFifo_out1 } } diff --git a/test/npu-xrt/two_col/aie.mlir b/test/npu-xrt/two_col/aie.mlir index 611dae93cf..6bfd28cdef 100644 --- a/test/npu-xrt/two_col/aie.mlir +++ b/test/npu-xrt/two_col/aie.mlir @@ -140,8 +140,8 @@ module { aiex.npu.rtp_write(@rtp1, 1, 0) aiex.npu.rtp_write(@rtp2, 1, 0) aiex.npu.rtp_write(@rtp3, 1, 0) - aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<2048xi32> - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<2048xi32> + aiex.npu.dma_memcpy_nd (%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<2048xi32> + aiex.npu.dma_memcpy_nd (%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<2048xi32> aiex.npu.dma_wait {symbol = @objFifo_out0} } } diff --git a/test/npu-xrt/vec_vec_add_memtile_init/aie.mlir b/test/npu-xrt/vec_vec_add_memtile_init/aie.mlir index 326c53e3a4..3791a5af91 100644 --- a/test/npu-xrt/vec_vec_add_memtile_init/aie.mlir +++ b/test/npu-xrt/vec_vec_add_memtile_init/aie.mlir @@ -158,8 +158,8 @@ module { } aie.shim_dma_allocation @in1(MM2S, 0, 0) aiex.runtime_sequence(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<256xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 1 : i64, metadata = @in1} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<256xi32> + aiex.npu.dma_memcpy_nd(%arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 1 : i64, metadata = @in1} : memref<256xi32> + aiex.npu.dma_memcpy_nd(%arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<256xi32> aiex.npu.dma_wait {symbol = @out} } %mem_0_2 = aie.mem(%tile_0_2) { diff --git a/test/npu-xrt/vec_vec_add_tile_init/aie.mlir b/test/npu-xrt/vec_vec_add_tile_init/aie.mlir index d8c96e9b08..948d95b477 100644 --- a/test/npu-xrt/vec_vec_add_tile_init/aie.mlir +++ b/test/npu-xrt/vec_vec_add_tile_init/aie.mlir @@ -141,8 +141,8 @@ module { } aie.shim_dma_allocation @in1(MM2S, 0, 0) aiex.runtime_sequence(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<256xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 1 : i64, metadata = @in1} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<256xi32> + aiex.npu.dma_memcpy_nd(%arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 1 : i64, metadata = @in1} : memref<256xi32> + aiex.npu.dma_memcpy_nd(%arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<256xi32> aiex.npu.dma_wait {symbol = @out} } %mem_0_2 = aie.mem(%tile_0_2) { diff --git a/test/npu-xrt/vector_scalar_using_dma/aie.mlir b/test/npu-xrt/vector_scalar_using_dma/aie.mlir index a390823633..2901bc950c 100644 --- a/test/npu-xrt/vector_scalar_using_dma/aie.mlir +++ b/test/npu-xrt/vector_scalar_using_dma/aie.mlir @@ -69,8 +69,8 @@ module { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c4096_i64 = arith.constant 4096 : i64 - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @out, issue_token = true} : memref<4096xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @in} : memref<4096xi32> + aiex.npu.dma_memcpy_nd(%arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @out, issue_token = true} : memref<4096xi32> + aiex.npu.dma_memcpy_nd(%arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @in} : memref<4096xi32> aiex.npu.dma_wait { symbol = @out } } diff --git a/test/objectFifo-stateful-transform/memtile_padding_test.mlir b/test/objectFifo-stateful-transform/memtile_padding_test.mlir index 958c4088d7..422cc2d21f 100644 --- a/test/objectFifo-stateful-transform/memtile_padding_test.mlir +++ b/test/objectFifo-stateful-transform/memtile_padding_test.mlir @@ -55,8 +55,8 @@ // CHECK: aie.end // CHECK: } // CHECK: aiex.runtime_sequence(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) { - // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> - // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, issue_token = true, metadata = @objFifo_out0} : memref<64x64xi8> + // CHECK: aiex.npu.dma_memcpy_nd(%arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> + // CHECK: aiex.npu.dma_memcpy_nd(%arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, issue_token = true, metadata = @objFifo_out0} : memref<64x64xi8> // CHECK: aiex.npu.dma_wait {symbol = @objFifo_out0} // CHECK: } // CHECK: aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) @@ -174,8 +174,8 @@ module { } aiex.runtime_sequence(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> + aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 61, 56][0, 0, 56, 1]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> + aiex.npu.dma_memcpy_nd (%arg2[0, 0, 0, 0][1, 1, 64, 64][0, 0, 64, 1]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> aiex.npu.dma_wait { symbol = @objFifo_out0 } } }