Xilinx · pvasireddy-amd · Dec 12, 2024 · Dec 13, 2024 · Dec 13, 2024 · Dec 13, 2024
@@ -494,9 +494,9 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
   let description = [{
     An n-dimensional half DMA operator.
 
-    Programs a DMA on coordinates (`x`, `y`) to access a memory `memref` with an access
-    pattern specified by `offsets`, `sizes` and `strides` or `static_offsets`, `static_sizes`
-    and `static_strides`. The operator references the target channel through the `metadata`
+    Programs a DMA to access a memory `memref` with an access pattern specified by `offsets`,
+    `sizes` and `strides` or `static_offsets`, `static_sizes` and `static_strides`. The operator
+    references the target DMA coordinates (`x`, `y`) and channel through the `metadata`
     symbol and specifies a descriptor `id` to be used, which will become the `bd_id` to be used
     when lowered further. The `issue_token` attribute specifies whether the execution of this
     operation should issue a token which can be received and read for synchronization purposes.
@@ -557,9 +557,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
   }];
 
   let arguments = (
-    ins I64Attr:$x,
-        I64Attr:$y,
-        AnyMemRef:$memref,
+    ins AnyMemRef:$memref,
         // NOTE: these are in reverse order: offset3, offset2, ...
         Variadic<I64>:$offsets,
         Variadic<I64>:$sizes,
@@ -580,7 +578,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
   );
 
   let assemblyFormat = [{
-    `(` $x `,` $y `,` $memref ``
+    `(` $memref ``
     custom<DynamicIndexList>($offsets, $static_offsets) ``
     custom<DynamicIndexList>($sizes, $static_sizes) ``
     custom<DynamicIndexList>($strides, $static_strides) ``

@@ -370,10 +370,6 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
       llvm::map_to_vector(llvm::reverse(getMixedStrides()), [](OpFoldResult s) {
         return getConstantIntValue(s).value();
       });
-  llvm::SmallVector<int64_t, 4> hardwareSizes(4);
-  llvm::SmallVector<int64_t, 4> hardwareStrides(4);
-  getHardwareStridesWraps(targetModel, buffer, inputSizes, inputStrides,
-                          hardwareSizes, hardwareStrides);
   int64_t offset = getOffsetInBytes();
 
   // The experimental HSA target uses this op on AIE1, skip all the AIE2
@@ -385,19 +381,6 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
     return emitOpError("Offset must be 4-byte-aligned.");
   }
 
-  // dma_memcpy_nd transfers of the form [1, 1, 1, len][0, 0, 0, 1] do not
-  // specify any data layout transformation, but simply express a contiguous
-  // transfer of `len`. For backwards compatibility, we allow this to proceed
-  // even if it exceeds the maximum stride/wrap size of any one dimension,
-  // and simply do not lower any data layout transformations, since there is
-  // no other way to express this at the dma_memcpy_nd interface otherwise.
-  bool skipTransformationChecks = isLinearTransferWithoutTransformation();
-  if (failed(verifyStridesWraps(*this, buffer, getX(), getY(), inputSizes,
-                                inputStrides, hardwareSizes, hardwareStrides,
-                                skipTransformationChecks))) {
-    return failure();
-  }
-
   // packet header
   if (auto packetInfo = getPacket()) {
     if (packetInfo->getPktType() > 7)

@@ -125,7 +125,7 @@ struct AIECtrlPacketToDmaPass : AIECtrlPacketToDmaBase<AIECtrlPacketToDmaPass> {
 
               StringRef metadata = builder.getStringAttr(shimDmaAllocName);
               builder.create<NpuDmaMemcpyNdOp>(
-                  builder.getUnknownLoc(), 0, 0, newBlockArg,
+                  builder.getUnknownLoc(), newBlockArg, SmallVector<Value>{},
                   SmallVector<Value>{}, SmallVector<Value>{},
                   SmallVector<Value>{}, ArrayRef(staticOffsets),
                   ArrayRef(staticSizes), ArrayRef(staticStrides),

@@ -359,6 +359,19 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
     // row
     row = IntegerAttr::get(i32ty, 0);
 
+    // dma_memcpy_nd transfers of the form [1, 1, 1, len][0, 0, 0, 1] do not
+    // specify any data layout transformation, but simply express a contiguous
+    // transfer of `len`. For backwards compatibility, we allow this to proceed
+    // even if it exceeds the maximum stride/wrap size of any one dimension,
+    // and simply do not lower any data layout transformations, since there is
+    // no other way to express this at the dma_memcpy_nd interface otherwise.
+    bool skipTransformationChecks = op.isLinearTransferWithoutTransformation();
+    if (failed(verifyStridesWraps(op, bufferType, col, 0, inputSizes,
+                                  inputStrides, sizes, strides,
+                                  skipTransformationChecks))) {
+      return failure();
+    }
+
     // arg_idx
     AIEX::RuntimeSequenceOp seq_op =
         op->getParentOfType<AIEX::RuntimeSequenceOp>();

@@ -63,8 +63,6 @@ def __init__(
         strides: MixedValues | None = None,
         issue_token: bool | None = None,
     ):
-        x = 0
-        y = 0
         if tap and not (offsets is None and sizes is None and strides is None):
             raise ValueError(
                 "NpuDmaMemcpyNd can take either a TileAccessPattern OR (sizes and/or strides and/or offsets), but not both."
@@ -92,8 +90,6 @@ def __init__(
         if isinstance(metadata, ObjectFifoCreateOp):
             metadata = metadata.sym_name.value
         super().__init__(
-            x,
-            y,
             mem,
             dynamic_offsets,
             dynamic_sizes,

@@ -24,8 +24,8 @@ module {
       %c8 = arith.constant 8 : i64
       %c16 = arith.constant 16 : i64
       %c32 = arith.constant 32 : i64
-      aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0, %c1]) { metadata = @of_toMem, id = 1 : i64, issue_token = true } : memref<64xi32>
-      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8, %c1]) { metadata = @of_fromMem, id = 0 : i64, issue_token = false } : memref<4x2x8xi32>
+      aiex.npu.dma_memcpy_nd (%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0, %c1]) { metadata = @of_toMem, id = 1 : i64, issue_token = true } : memref<64xi32>
+      aiex.npu.dma_memcpy_nd (%in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8, %c1]) { metadata = @of_fromMem, id = 0 : i64, issue_token = false } : memref<4x2x8xi32>
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
     aie.shim_dma_allocation @of_toMem (S2MM, 0, 0)

@@ -19,7 +19,7 @@ module @shimDmaMemcpy{
   aie.device(xcve2302) {
     memref.global "public" @toMem : memref<1xbf16>
     aiex.runtime_sequence(%arg0: memref<1xbf16>, %arg1: memref<1xbf16>, %arg2: memref<1xbf16>) {
-      aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<1xbf16>
+      aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<1xbf16>
       aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
     }
     aie.shim_dma_allocation @toMem (S2MM, 0, 0)

@@ -19,7 +19,7 @@ module @shimDmaMemcpy{
   aie.device(xcve2302) {
     memref.global "public" @toMem : memref<65536xi64>
     aiex.runtime_sequence(%arg0: memref<65536xi64>, %arg1: memref<65536xi64>, %arg2: memref<65536xi64>) {
-      aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<65536xi64>
+      aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<65536xi64>
       aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
     }
     aie.shim_dma_allocation @toMem (S2MM, 0, 0)

@@ -23,8 +23,8 @@ module  {
     memref.global "public" @toMem : memref<16xi32>
     memref.global "public" @fromMem : memref<16xi32>
     aiex.runtime_sequence(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-      aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
-      aiex.npu.dma_memcpy_nd (0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32>
+      aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
+      aiex.npu.dma_memcpy_nd (%arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32>
     }
     aie.shim_dma_allocation @fromMem (MM2S, 0, 0)
     aie.shim_dma_allocation @toMem (S2MM, 0, 0)

@@ -26,8 +26,8 @@ module  {
     memref.global "public" @toMem : memref<16xi32>
     memref.global "public" @fromMem : memref<16xi32>
     aiex.runtime_sequence(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-        aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32>
-        aiex.npu.dma_memcpy_nd (0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64, issue_token = false } : memref<16xi32>
+        aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32>
+        aiex.npu.dma_memcpy_nd (%arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64, issue_token = false } : memref<16xi32>
     }
     aie.shim_dma_allocation @fromMem (MM2S, 0, 0)
     aie.shim_dma_allocation @toMem (S2MM, 0, 0)

@@ -30,7 +30,7 @@ module @shimDmaMemcpy{
   aie.device(xcve2302) {
     memref.global "public" @toMem : memref<65536xbf16>
     aiex.runtime_sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) {
-      aiex.npu.dma_memcpy_nd (2, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<65536xbf16>
+      aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<65536xbf16>
       aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
     }
     aie.shim_dma_allocation @toMem (S2MM, 0, 2)

@@ -55,8 +55,8 @@ module {
     aie.shim_dma_allocation @out0(S2MM, 0, 6)
 
     aiex.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) {
-      aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 0 : i64, metadata = @out0} : memref<64xi32>
-      aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 1 : i64, metadata = @in0} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd (%arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 0 : i64, metadata = @out0} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd (%arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 1 : i64, metadata = @in0} : memref<64xi32>
       aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
     }
   }

@@ -24,7 +24,7 @@ module {
   aie.shim_dma_allocation @airMemcpyId12(MM2S, 0, 0)
   memref.global "public" @airMemcpyId12 : memref<1x2x1x32x32xi32, 1 : i32>
   aiex.runtime_sequence (%arg0: memref<2x64x64xi32>, %arg1: memref<2x64x64xi32>, %arg2: memref<2x64x64xi32>) {
-    aiex.npu.dma_memcpy_nd(0, 0, %arg0[1, 0, 0, 0][1, 2, 32, 32][4096, 2048, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId12} : memref<2x64x64xi32>
+    aiex.npu.dma_memcpy_nd(%arg0[1, 0, 0, 0][1, 2, 32, 32][4096, 2048, 64, 1]) {id = 0 : i64, metadata = @airMemcpyId12} : memref<2x64x64xi32>
   }
  }
 }
@@ -19,7 +19,7 @@ module {
       %c1920 = arith.constant 1920 : i64
       %c1080 = arith.constant 1080 : i64
       // expected-error@+1 {{Size 0 exceeds the [0:1023] range}}
-      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi32>
+      aiex.npu.dma_memcpy_nd (%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi32>
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
   }
@@ -39,7 +39,7 @@ module {
       %c32 = arith.constant 32 : i64
       %c128 = arith.constant 128 : i64
       // expected-error@+1 {{Size 3 exceeds the [1:64] range}}
-      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c128,%c2,%c2,%c8][%c0,%c16,%c8,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<128x4x2x8xi32>
+      aiex.npu.dma_memcpy_nd (%in[%c0,%c0,%c0,%c0][%c128,%c2,%c2,%c8][%c0,%c16,%c8,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<128x4x2x8xi32>
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
   }
@@ -55,7 +55,7 @@ module {
       %c2 = arith.constant 2 : i64
       %c2097152 = arith.constant 2097152 : i64
       // expected-error@+1 {{Stride 1 exceeds the [1:1048576] range}}
-      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2][%c0,%c0,%c2097152,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<8388608xi32>
+      aiex.npu.dma_memcpy_nd (%in[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2][%c0,%c0,%c2097152,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<8388608xi32>
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
   }
@@ -73,7 +73,7 @@ module {
       %c2 = arith.constant 2 : i64
       %c8 = arith.constant 8 : i64
       // expected-error@+1 {{Offset must be 4-byte-aligned}}
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c1][%c1,%c1,%c1,%c8][%c0,%c0,%c1,%c1]) { metadata = @fifo, id = 0 : i64 } : memref<8xi8>
+      aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c1][%c1,%c1,%c1,%c8][%c0,%c0,%c1,%c1]) { metadata = @fifo, id = 0 : i64 } : memref<8xi8>
     }
     aie.shim_dma_allocation @fifo (MM2S, 0, 0)
   }
@@ -95,7 +95,7 @@ module {
       %c2048 = arith.constant 2048 : i64
       // Although 2048 exceeds the 0:1023 limit for size 0, since the elements are i8s,
       // this should be a size of 512 in address granularity (4 bytes) and hence pass the test.
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
+      aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
     }
     aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
   }
@@ -113,7 +113,7 @@ module {
       %c8 = arith.constant 8 : i64
       %c2048 = arith.constant 2048 : i64
       // expected-error@+1 {{Size 0 exceeds the [0:1023] range}}
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16>
+      aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16>
     }
     aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
   }
@@ -132,7 +132,7 @@ module {
       %c2 = arith.constant 2 : i64  // Stride of 2 i8s = 2 bytes < 4 byte granularity, should not be possible
       %c8 = arith.constant 8 : i64
       // expected-error@+1 {{Stride 1 is 2 elements * 1 bytes = 2 bytes, which is not divisible by 4}}
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c2,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
+      aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c2,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
     }
     aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
   }
@@ -149,7 +149,7 @@ module {
       %c4 = arith.constant 4 : i64
       %c8 = arith.constant 8 : i64
       // expected-error@+1 {{2 elements at 1 bytes each equal 2 bytes, which is not divisible by 4}}
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
+      aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
     }
     aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
   }
@@ -168,7 +168,7 @@ module {
       %c4 = arith.constant 4 : i64
       %c8 = arith.constant 8 : i64
       // expected-error@+1 {{Stride 0 is 2 elements * 1 bytes = 2 bytes, which is not divisible by 4}}
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c0,%c2]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
+      aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c0,%c2]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8>
     }
     aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
   }
@@ -186,7 +186,7 @@ module {
       %c3 = arith.constant 3 : i64
       %c8 = arith.constant 8 : i64
       // expected-error@+1 {{3 elements at 2 bytes each equal 6 bytes, which is not divisible by 4}}
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c3][%c0,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16>
+      aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c3][%c0,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16>
     }
     aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
   }
@@ -204,7 +204,7 @@ module {
       %c4 = arith.constant 4 : i64
       %c8 = arith.constant 8 : i64
       // expected-error@+1 {{Unsupported tile type at (0, 0) Must be ShimNOC, Mem or Core.}}
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c4][%c0,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16>
+      aiex.npu.dma_memcpy_nd (%a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c4][%c0,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16>
     }
     aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
   }
@@ -223,9 +223,9 @@ module {
       %c3 = arith.constant 3 : i64
       %c8 = arith.constant 8 : i64
       %c1572864 = arith.constant 1572864 : i64
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c1572864,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi32>
+      aiex.npu.dma_memcpy_nd (%a[%c1,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c1572864,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi32>
       // expected-error@+1 {{Stride 3 exceeds the [1:1048576] range.}}
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c1572864,%c0,%c0,%c1]) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32>
+      aiex.npu.dma_memcpy_nd (%a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c1572864,%c0,%c0,%c1]) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32>
     }
     aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
   }
@@ -243,9 +243,9 @@ module {
       %c2 = arith.constant 2 : i64
       %c3 = arith.constant 3 : i64
       %c8 = arith.constant 8 : i64
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = <pkt_id = 31, pkt_type = 7>) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi32>
+      aiex.npu.dma_memcpy_nd (%a[%c1,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = <pkt_id = 31, pkt_type = 7>) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi32>
       // expected-error@+1 {{Packet ID field can only hold 5 bits.}}
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = <pkt_id = 32, pkt_type = 2>) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32>
+      aiex.npu.dma_memcpy_nd (%a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = <pkt_id = 32, pkt_type = 2>) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32>
     }
     aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
   }
@@ -264,7 +264,7 @@ module {
       %c3 = arith.constant 3 : i64
       %c8 = arith.constant 8 : i64
       // expected-error@+1 {{Packet type field can only hold 3 bits.}}
-      aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = <pkt_id = 2, pkt_type = 8>) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32>
+      aiex.npu.dma_memcpy_nd (%a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c0,%c0,%c0,%c1], packet = <pkt_id = 2, pkt_type = 8>) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32>
     }
     aie.shim_dma_allocation @objectfifo (MM2S, 0, 0)
   }