diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
index 564d011e36..7ef7669a50 100644
--- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
+++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
@@ -419,6 +419,62 @@ struct DmaWaitToNpuPattern : OpConversionPattern {
   }
 };
 
+/// Look up the shim DMA allocation whose symbol name equals `sym_name`.
+///
+/// Scans `shimDmaAllocOps` in order and returns the first op whose
+/// getSymName() compares equal to `sym_name`, or std::nullopt when none does.
+/// The list is taken as a non-owning ArrayRef so that a lookup does not copy
+/// the caller's vector.
+std::optional<AIE::ShimDMAAllocationOp>
+getAllocOpForSymbol(ArrayRef<AIE::ShimDMAAllocationOp> shimDmaAllocOps,
+                    StringRef sym_name) {
+  for (auto shimDmaAllocOp : shimDmaAllocOps)
+    if (shimDmaAllocOp.getSymName() == sym_name)
+      return shimDmaAllocOp;
+  return std::nullopt;
+}
+
+/// Create a sync op for every DMA in `device` that copies results to the host.
+///
+/// In each func.func, every NpuDmaMemcpyNdOp whose metadata symbol names a
+/// shim DMA allocation with channel direction S2MM gets one sync op, created
+/// directly before the function's return op. DMAs whose symbol has no
+/// allocation, or whose allocation is not S2MM, produce nothing.
+void insertNpuSyncOpForResults(AIE::DeviceOp device) {
+  SmallVector<AIE::ShimDMAAllocationOp> shimDmaAllocOps;
+  device.walk([&](AIE::ShimDMAAllocationOp shimDmaAllocOp) {
+    shimDmaAllocOps.push_back(shimDmaAllocOp);
+  });
+  device.walk([&](mlir::func::FuncOp f) {
+    // Gather the DMA ops and the return op before creating anything.
+    SmallVector<AIEX::NpuDmaMemcpyNdOp> dmas;
+    Operation *returnOp = nullptr;
+    // If several return ops exist, the last one visited is used.
+    f.walk([&](mlir::func::ReturnOp op) { returnOp = op.getOperation(); });
+    f.walk([&](AIEX::NpuDmaMemcpyNdOp dma) { dmas.push_back(dma); });
+    // No return op means no insertion point; bail out instead of handing a
+    // null operation to the builder.
+    if (!returnOp)
+      return;
+    // Each created op lands before returnOp, so the syncs follow the order
+    // in which the DMAs were walked.
+    OpBuilder builder(returnOp);
+    for (auto dma : dmas) {
+      auto infoOp = getAllocOpForSymbol(shimDmaAllocOps, dma.getMetadata());
+      if (!infoOp || infoOp->getChannelDir() != AIE::DMAChannelDir::S2MM)
+        continue;
+      // Found dma op copying results to host
+      auto col = builder.getI32IntegerAttr(infoOp->getCol());
+      auto row = builder.getI32IntegerAttr(0);
+      auto dir = builder.getI32IntegerAttr(0);
+      auto chan = builder.getI32IntegerAttr(infoOp->getChannelIndex());
+      auto col_num = builder.getI32IntegerAttr(1);
+      auto row_num = builder.getI32IntegerAttr(1);
+      builder.create<AIEX::NpuSyncOp>(dma->getLoc(), col, row, dir, chan,
+                                      col_num, row_num);
+    }
+  });
+}
+
 struct AIEDmaToNpuPass : AIEDmaToNpuBase {
   void runOnOperation() override {
 
@@ -441,6 +497,9 @@ struct AIEDmaToNpuPass : AIEDmaToNpuBase {
     patterns.insert(&getContext(), cachingGetter);
     patterns.insert(&getContext());
 
+    // Insert sync op after copying data out to host
+    insertNpuSyncOpForResults(device);
+
     if
(failed(applyPartialConversion(device, target, std::move(patterns)))) signalPassFailure(); } diff --git a/programming_examples/basic/dma_transpose/aie2.py b/programming_examples/basic/dma_transpose/aie2.py index 7af771d5b7..495a68975d 100644 --- a/programming_examples/basic/dma_transpose/aie2.py +++ b/programming_examples/basic/dma_transpose/aie2.py @@ -59,7 +59,6 @@ def sequence(A, B, C): npu_dma_memcpy_nd( metadata="in", bd_id=1, mem=A, sizes=[1, K, M, 1], strides=[1, 1, K] ) - npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py index 6b27d9f9e3..08658c9b4f 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py @@ -204,9 +204,6 @@ def sequence(A, B, C): strides=[0, 0, 0], ) - for i in range(n_cores): - npu_sync(column=i, row=0, direction=0, channel=0) - print(ctx.module) diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index 909fba0c43..9d78aba889 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -227,8 +227,6 @@ def sequence(A, B, C): strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s], ) - npu_sync(column=0, row=0, direction=0, channel=0) - print(ctx.module) diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 76453f4b94..548c1f2d98 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -366,8 +366,6 @@ def sequence(A, B, C): sizes=[N_div_n_div_n_cols, K_div_k, k, n_in_i32s], 
strides=[n_x_n_cols_in_i32s, k_x_N_in_i32s, N_in_i32s], ) - for i in range(n_cols): - npu_sync(column=i, row=0, direction=0, channel=0) # print(ctx.module.operation.verify()) print(ctx.module) diff --git a/programming_examples/basic/matrix_scalar_add/aie2.py b/programming_examples/basic/matrix_scalar_add/aie2.py index 52a71688fc..a774dd076d 100644 --- a/programming_examples/basic/matrix_scalar_add/aie2.py +++ b/programming_examples/basic/matrix_scalar_add/aie2.py @@ -94,7 +94,6 @@ def sequence(inTensor, notUsed, outTensor): sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH], strides=[1, 1, IMAGE_WIDTH], ) - npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/passthrough_dmas/aie2.py b/programming_examples/basic/passthrough_dmas/aie2.py index 10becd4e27..413c9c27be 100644 --- a/programming_examples/basic/passthrough_dmas/aie2.py +++ b/programming_examples/basic/passthrough_dmas/aie2.py @@ -64,7 +64,6 @@ def core_body(): def sequence(A, B, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index 4fe9a7ed9b..759a205baa 100644 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -85,7 +85,6 @@ def sequence(inTensor, outTensor, notUsed): mem=outTensor, sizes=[1, 1, 1, tensorSizeInInt32s], ) - npu_sync(column=0, row=0, direction=0, channel=0) try: diff --git a/programming_examples/basic/vector_exp/aie2.py b/programming_examples/basic/vector_exp/aie2.py index af58a6392b..99d26ef316 100644 --- a/programming_examples/basic/vector_exp/aie2.py +++ b/programming_examples/basic/vector_exp/aie2.py @@ -113,7 +113,6 @@ def sequence(A, C): npu_dma_memcpy_nd( metadata="inA", 
bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] ) - npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/vector_reduce_add/aie2.py b/programming_examples/basic/vector_reduce_add/aie2.py index fe6f049984..564d875a9a 100644 --- a/programming_examples/basic/vector_reduce_add/aie2.py +++ b/programming_examples/basic/vector_reduce_add/aie2.py @@ -70,7 +70,6 @@ def core_body(): def sequence(A, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/vector_reduce_max/aie2.py b/programming_examples/basic/vector_reduce_max/aie2.py index 31ee9f181a..2c55132579 100644 --- a/programming_examples/basic/vector_reduce_max/aie2.py +++ b/programming_examples/basic/vector_reduce_max/aie2.py @@ -70,7 +70,6 @@ def core_body(): def sequence(A, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/vector_reduce_min/aie2.py b/programming_examples/basic/vector_reduce_min/aie2.py index 430ad5f9ef..8ae8a92af5 100644 --- a/programming_examples/basic/vector_reduce_min/aie2.py +++ b/programming_examples/basic/vector_reduce_min/aie2.py @@ -70,7 +70,6 @@ def core_body(): def sequence(A, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/vector_scalar_add/aie2.py b/programming_examples/basic/vector_scalar_add/aie2.py index 46b44308b6..8562fc5745 100644 --- a/programming_examples/basic/vector_scalar_add/aie2.py +++ 
b/programming_examples/basic/vector_scalar_add/aie2.py @@ -71,7 +71,6 @@ def sequence(inTensor, outTensor): npu_dma_memcpy_nd( metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, PROBLEM_SIZE] ) - npu_sync(column=0, row=0, direction=0, channel=0) # Declares that subsequent code is in mlir-aie context diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py index 8d367ced50..82126016aa 100644 --- a/programming_examples/basic/vector_scalar_mul/aie2.py +++ b/programming_examples/basic/vector_scalar_mul/aie2.py @@ -101,7 +101,6 @@ def sequence(A, F, C): ) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N_in_i32s]) npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1]) - npu_sync(column=0, row=0, direction=0, channel=0) try: diff --git a/programming_examples/basic/vector_vector_add/aie2.py b/programming_examples/basic/vector_vector_add/aie2.py index 62ad20534c..2e0fdbfe22 100644 --- a/programming_examples/basic/vector_vector_add/aie2.py +++ b/programming_examples/basic/vector_vector_add/aie2.py @@ -81,7 +81,6 @@ def sequence(A, B, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N]) - npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/vector_vector_modulo/aie2.py b/programming_examples/basic/vector_vector_modulo/aie2.py index 83d5675e85..a21bb87c16 100644 --- a/programming_examples/basic/vector_vector_modulo/aie2.py +++ b/programming_examples/basic/vector_vector_modulo/aie2.py @@ -81,7 +81,6 @@ def sequence(A, B, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N]) - npu_sync(column=0, 
row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/vector_vector_mul/aie2.py b/programming_examples/basic/vector_vector_mul/aie2.py index fa07bbe58a..4b20bdfaa2 100644 --- a/programming_examples/basic/vector_vector_mul/aie2.py +++ b/programming_examples/basic/vector_vector_mul/aie2.py @@ -81,7 +81,6 @@ def sequence(A, B, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N]) - npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/ml/bottleneck/aie2.py b/programming_examples/ml/bottleneck/aie2.py index 669aacb415..b4f7620d34 100644 --- a/programming_examples/ml/bottleneck/aie2.py +++ b/programming_examples/ml/bottleneck/aie2.py @@ -631,8 +631,6 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): sizes=[1, 1, 1, totalWeightsSize32b], ) - npu_sync(column=0, row=0, direction=0, channel=0) - print(ctx.module) diff --git a/programming_examples/ml/conv2d/aie2.py b/programming_examples/ml/conv2d/aie2.py index 11e92f55c2..189696067b 100644 --- a/programming_examples/ml/conv2d/aie2.py +++ b/programming_examples/ml/conv2d/aie2.py @@ -168,7 +168,6 @@ def sequence(I, W, O): mem=W, sizes=[1, 1, 1, weightsInInt32s], ) - npu_sync(column=0, row=0, direction=0, channel=0) # print(ctx.module.operation.verify()) print(ctx.module) diff --git a/programming_examples/ml/conv2d_fused_relu/aie2.py b/programming_examples/ml/conv2d_fused_relu/aie2.py index faafaf4d86..d0e591ad1c 100644 --- a/programming_examples/ml/conv2d_fused_relu/aie2.py +++ b/programming_examples/ml/conv2d_fused_relu/aie2.py @@ -254,7 +254,6 @@ def sequence(I, W, O): mem=W, sizes=[1, 1, 1, weightsInInt32s], ) - npu_sync(column=0, row=0, direction=0, channel=0) # print(ctx.module.operation.verify()) print(ctx.module) diff --git 
a/programming_examples/ml/eltwise_add/aie2.py b/programming_examples/ml/eltwise_add/aie2.py index 354e9f78d1..936da3f2bf 100644 --- a/programming_examples/ml/eltwise_add/aie2.py +++ b/programming_examples/ml/eltwise_add/aie2.py @@ -152,7 +152,6 @@ def sequence(A, B, C): npu_dma_memcpy_nd( metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s] ) - npu_sync(column=0, row=0, direction=0, channel=0) try: diff --git a/programming_examples/ml/eltwise_mul/aie2.py b/programming_examples/ml/eltwise_mul/aie2.py index 5808d0c998..0f584ef38d 100644 --- a/programming_examples/ml/eltwise_mul/aie2.py +++ b/programming_examples/ml/eltwise_mul/aie2.py @@ -153,7 +153,6 @@ def sequence(A, B, C): npu_dma_memcpy_nd( metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s] ) - npu_sync(column=0, row=0, direction=0, channel=0) try: diff --git a/programming_examples/ml/relu/aie2.py b/programming_examples/ml/relu/aie2.py index e4da4eafdf..a55f7cb179 100644 --- a/programming_examples/ml/relu/aie2.py +++ b/programming_examples/ml/relu/aie2.py @@ -124,7 +124,6 @@ def sequence(A, C): npu_dma_memcpy_nd( metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] ) - npu_sync(column=0, row=0, direction=0, channel=0) try: diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py index 94f5888512..150128d887 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -986,8 +986,6 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): sizes=[1, 1, 1, totalWeightsSize32b_rest], ) - npu_sync(column=1, row=0, direction=0, channel=0) - res = ctx.module.operation.verify() if res == True: print(ctx.module) diff --git a/programming_examples/ml/softmax/aie2.py b/programming_examples/ml/softmax/aie2.py index 47d60adf6a..269696f4e6 100755 --- a/programming_examples/ml/softmax/aie2.py +++ b/programming_examples/ml/softmax/aie2.py @@ -128,7 +128,6 @@ def 
sequence(A, C): npu_dma_memcpy_nd( metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] ) - npu_sync(column=0, row=0, direction=0, channel=0) try: diff --git a/programming_examples/vision/color_detect/aie2_colorDetect.py b/programming_examples/vision/color_detect/aie2_colorDetect.py index 19e4e04ca9..736ef2a6b3 100644 --- a/programming_examples/vision/color_detect/aie2_colorDetect.py +++ b/programming_examples/vision/color_detect/aie2_colorDetect.py @@ -266,7 +266,6 @@ def sequence(I, B, O): mem=O, sizes=[1, 1, 1, height * lineWidthInInt32s], ) - npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py index 1215a4ddd0..7766cee3c5 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py @@ -284,7 +284,6 @@ def sequence(inTensor, notUsed, outTensor): mem=outTensor, sizes=[1, 1, 1, tensorSizeInInt32s], ) - npu_sync(column=0, row=0, direction=0, channel=0) # print(ctx.module.operation.verify()) print(ctx.module) diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py index 1af069d94e..c927cdd8c0 100644 --- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py +++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py @@ -312,7 +312,6 @@ def sequence(I, B, O): mem=I, sizes=[1, 1, 1, tensorSizeInInt32s], ) - npu_sync(column=0, row=0, direction=0, channel=0) # print(ctx.module.operation.verify()) print(ctx.module) diff --git a/programming_examples/vision/vision_passthrough/aie2.py b/programming_examples/vision/vision_passthrough/aie2.py index 8d568af388..dad0ee09ff 100644 --- a/programming_examples/vision/vision_passthrough/aie2.py +++ b/programming_examples/vision/vision_passthrough/aie2.py @@ -165,7 +165,6 @@ def 
sequence(inTensor, notUsed, outTensor): mem=outTensor, sizes=[1, 1, 1, tensorSizeInInt32s], ) - npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir index 0621e0b622..13c36fbf9e 100644 --- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir +++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir @@ -55,7 +55,6 @@ module @passThroughLine_aie2 { //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words]) aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32> aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir index c2c31b0d9b..67efe2e747 100644 --- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir +++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir @@ -56,7 +56,6 @@ module @passThroughLine_aie2 { //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words]) aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32> aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @outOF, id = 0 : i64 } : memref<2073600xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, 
column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir index dd66475ca5..394ba07bd8 100644 --- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir +++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir @@ -55,7 +55,6 @@ module @passThroughLine_aie2 { //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words]) aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<1152xi32> aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<1152xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py index 6925e6bd2d..6ad968e7eb 100644 --- a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py +++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py @@ -60,7 +60,6 @@ def sequence(inTensor, notUsed, outTensor): npu_dma_memcpy_nd( metadata="in", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48] ) - npu_sync(column=0, row=0, direction=0, channel=0) res = ctx.module.operation.verify() if res == True: diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py index 989808392c..52c5923d2f 100644 --- a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py +++ 
b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py @@ -64,7 +64,6 @@ def sequence(inTensor, notUsed, outTensor): npu_dma_memcpy_nd( metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48] ) - npu_sync(column=0, row=0, direction=0, channel=0) res = ctx.module.operation.verify() if res == True: diff --git a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py index b8c264ea28..836c2fbba6 100644 --- a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py +++ b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py @@ -101,7 +101,6 @@ def sequence(inTensor, notUsed, outTensor): npu_dma_memcpy_nd( metadata="in", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48] ) - npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_guide/section-4/section-4b/aie2.py b/programming_guide/section-4/section-4b/aie2.py index 910d4b1a94..bb9a742dff 100644 --- a/programming_guide/section-4/section-4b/aie2.py +++ b/programming_guide/section-4/section-4b/aie2.py @@ -82,7 +82,6 @@ def sequence(A, F, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096]) npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1]) - npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir index 9641e2ac7c..214c4c4d4f 100644 --- a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir +++ b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir @@ -105,7 +105,6 @@ module { func.func @bobsyouruncle(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 
64][0, 0, 0]) {id = 0 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_in} : memref<64xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_out} : memref<64xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } diff --git a/test/npu-xrt/add_314_using_dma_op/aie.mlir b/test/npu-xrt/add_314_using_dma_op/aie.mlir index 8850e79ee0..5d408c6a2e 100644 --- a/test/npu-xrt/add_314_using_dma_op/aie.mlir +++ b/test/npu-xrt/add_314_using_dma_op/aie.mlir @@ -70,7 +70,6 @@ module { %c64_i64 = arith.constant 64 : i64 aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/test/npu-xrt/add_one_objFifo/aie.mlir b/test/npu-xrt/add_one_objFifo/aie.mlir index f05628a742..487901f383 100644 --- a/test/npu-xrt/add_one_objFifo/aie.mlir +++ b/test/npu-xrt/add_one_objFifo/aie.mlir @@ -49,7 +49,6 @@ module { %c64 = arith.constant 64 : i64 aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> - aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } return } } diff --git 
a/test/npu-xrt/add_one_two/aie1.mlir b/test/npu-xrt/add_one_two/aie1.mlir index 676dda4305..ab4bcba4e5 100644 --- a/test/npu-xrt/add_one_two/aie1.mlir +++ b/test/npu-xrt/add_one_two/aie1.mlir @@ -46,7 +46,6 @@ module { %c64 = arith.constant 64 : i64 aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> - aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } return } } diff --git a/test/npu-xrt/add_one_using_dma/aie.mlir b/test/npu-xrt/add_one_using_dma/aie.mlir index b8a319c431..94a88d3cf6 100644 --- a/test/npu-xrt/add_one_using_dma/aie.mlir +++ b/test/npu-xrt/add_one_using_dma/aie.mlir @@ -81,7 +81,6 @@ module { %c64_i64 = arith.constant 64 : i64 aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/test/npu-xrt/cascade_flows/aie.mlir b/test/npu-xrt/cascade_flows/aie.mlir index 54fbd77091..a71c36936f 100644 --- a/test/npu-xrt/cascade_flows/aie.mlir +++ b/test/npu-xrt/cascade_flows/aie.mlir @@ -65,7 +65,6 @@ module { %c64 = arith.constant 64 : i64 aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata 
= @objFifo_in0, id = 0 : i64 } : memref<64xi32> - aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } return } } diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir index 3112c0c05e..c39198a865 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir @@ -641,7 +641,6 @@ module { aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } {sym_name = "segment_0"} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir index fb58fa0fb0..60e48e95ba 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir @@ -489,7 +489,6 @@ module { aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, 
row_num = 1 : i32} return } } {sym_name = "segment_0"} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir index e6d4d7df97..720ec3960a 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir @@ -194,7 +194,6 @@ module { aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } {sym_name = "segment_0"} diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir index 6ffc1cfda2..878bd6b857 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir @@ -525,7 +525,6 @@ module { aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } {sym_name = "segment_0"} diff --git a/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir 
b/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir index f589386ede..d1af86df00 100644 --- a/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir +++ b/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir @@ -119,7 +119,6 @@ module { aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 2 : i64, metadata = @inB} : memref<8192xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c4096_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 3 : i64, metadata = @inA} : memref<8192xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 4 : i64, metadata = @inB} : memref<8192xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/test/npu-xrt/two_col/aie.mlir b/test/npu-xrt/two_col/aie.mlir index 4dac457a33..aed21f3cd7 100644 --- a/test/npu-xrt/two_col/aie.mlir +++ b/test/npu-xrt/two_col/aie.mlir @@ -142,7 +142,6 @@ module { aiex.npu.rtp_write(1, 5, 1, 0) { buffer_sym_name = "rtp3" } aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<2048xi32> aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<2048xi32> - aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } return } } diff --git a/test/npu-xrt/vector_scalar_using_dma/aie.mlir b/test/npu-xrt/vector_scalar_using_dma/aie.mlir index 81ccccffbd..c2a42f5a6c 100644 --- a/test/npu-xrt/vector_scalar_using_dma/aie.mlir +++ b/test/npu-xrt/vector_scalar_using_dma/aie.mlir @@ -71,7 +71,6 @@ module { %c4096_i64 = arith.constant 4096 : i64 
aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @out} : memref<4096xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @in} : memref<4096xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return }