diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
index 564d011e36..7ef7669a50 100644
--- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
+++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
@@ -419,6 +419,46 @@ struct DmaWaitToNpuPattern : OpConversionPattern<NpuDmaWaitOp> {
   }
 };
 
+// Returns the first op in `shimDmaAllocOps` named `sym_name`, if any.
+std::optional<AIE::ShimDMAAllocationOp> getAllocOpForSymbol(
+    ArrayRef<AIE::ShimDMAAllocationOp> shimDmaAllocOps, StringRef sym_name) {
+  for (auto shimDmaAllocOp : shimDmaAllocOps)
+    if (shimDmaAllocOp.getSymName() == sym_name)
+      return shimDmaAllocOp;
+  return std::nullopt;
+}
+
+// Inserts an NpuSyncOp before each func's return for every NpuDmaMemcpyNdOp
+// whose metadata names an S2MM shim DMA allocation (data copied out to host).
+void insertNpuSyncOpForResults(AIE::DeviceOp device) {
+  SmallVector<AIE::ShimDMAAllocationOp> shimDmaAllocOps;
+  device.walk([&](AIE::ShimDMAAllocationOp shimDmaAllocOp) {
+    shimDmaAllocOps.push_back(shimDmaAllocOp);
+  });
+  device.walk([&](mlir::func::FuncOp f) {
+    // If a func has several returns, the last one visited by the walk is used.
+    Operation *returnOp = nullptr;
+    f.walk([&](mlir::func::ReturnOp op) { returnOp = op.getOperation(); });
+    if (!returnOp)
+      return; // No return op to anchor on; avoid a null insertion point.
+    SmallVector<AIEX::NpuDmaMemcpyNdOp> dmas;
+    f.walk([&](AIEX::NpuDmaMemcpyNdOp dma) { dmas.push_back(dma); });
+    OpBuilder builder(returnOp); // Insert syncs right before the return.
+    for (auto dma : dmas) {
+      auto infoOp = getAllocOpForSymbol(shimDmaAllocOps, dma.getMetadata());
+      if (!infoOp || infoOp->getChannelDir() != AIE::DMAChannelDir::S2MM)
+        continue;
+      auto col = builder.getI32IntegerAttr(infoOp->getCol());
+      auto chan = builder.getI32IntegerAttr(infoOp->getChannelIndex());
+      auto zero = builder.getI32IntegerAttr(0);
+      auto one = builder.getI32IntegerAttr(1);
+      builder.create<AIEX::NpuSyncOp>(dma->getLoc(), col, /*row=*/zero,
+                                      /*dir=*/zero, chan,
+                                      /*col_num=*/one, /*row_num=*/one);
+    }
+  });
+}
+
 struct AIEDmaToNpuPass : AIEDmaToNpuBase<AIEDmaToNpuPass> {
   void runOnOperation() override {
 
@@ -441,6 +481,9 @@ struct AIEDmaToNpuPass : AIEDmaToNpuBase<AIEDmaToNpuPass> {
     patterns.insert<PushToNpuPattern>(&getContext(), cachingGetter);
     patterns.insert<RtpToNpuPattern>(&getContext());
 
+    // Insert sync op after copying data out to host
+    insertNpuSyncOpForResults(device);
+
     if (failed(applyPartialConversion(device, target, std::move(patterns))))
       signalPassFailure();
   }
diff --git a/programming_examples/basic/dma_transpose/aie2.py b/programming_examples/basic/dma_transpose/aie2.py
index 7af771d5b7..495a68975d 100644
--- a/programming_examples/basic/dma_transpose/aie2.py
+++ b/programming_examples/basic/dma_transpose/aie2.py
@@ -59,7 +59,6 @@ def sequence(A, B, C):
                 npu_dma_memcpy_nd(
                     metadata="in", bd_id=1, mem=A, sizes=[1, K, M, 1], strides=[1, 1, K]
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
index 6b27d9f9e3..08658c9b4f 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
@@ -204,9 +204,6 @@ def sequence(A, B, C):
                         strides=[0, 0, 0],
                     )
 
-                for i in range(n_cores):
-                    npu_sync(column=i, row=0, direction=0, channel=0)
-
     print(ctx.module)
 
 
diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
index 909fba0c43..9d78aba889 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -227,8 +227,6 @@ def sequence(A, B, C):
                             strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s],
                         )
 
-                    npu_sync(column=0, row=0, direction=0, channel=0)
-
     print(ctx.module)
 
 
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
index 76453f4b94..548c1f2d98 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
@@ -366,8 +366,6 @@ def sequence(A, B, C):
                                 sizes=[N_div_n_div_n_cols, K_div_k, k, n_in_i32s],
                                 strides=[n_x_n_cols_in_i32s, k_x_N_in_i32s, N_in_i32s],
                             )
-                    for i in range(n_cols):
-                        npu_sync(column=i, row=0, direction=0, channel=0)
 
     # print(ctx.module.operation.verify())
     print(ctx.module)
diff --git a/programming_examples/basic/matrix_scalar_add/aie2.py b/programming_examples/basic/matrix_scalar_add/aie2.py
index 52a71688fc..a774dd076d 100644
--- a/programming_examples/basic/matrix_scalar_add/aie2.py
+++ b/programming_examples/basic/matrix_scalar_add/aie2.py
@@ -94,7 +94,6 @@ def sequence(inTensor, notUsed, outTensor):
                 sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
                 strides=[1, 1, IMAGE_WIDTH],
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/passthrough_dmas/aie2.py b/programming_examples/basic/passthrough_dmas/aie2.py
index 10becd4e27..413c9c27be 100644
--- a/programming_examples/basic/passthrough_dmas/aie2.py
+++ b/programming_examples/basic/passthrough_dmas/aie2.py
@@ -64,7 +64,6 @@ def core_body():
             def sequence(A, B, C):
                 npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
                 npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
index 4fe9a7ed9b..759a205baa 100644
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -85,7 +85,6 @@ def sequence(inTensor, outTensor, notUsed):
                 mem=outTensor,
                 sizes=[1, 1, 1, tensorSizeInInt32s],
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:
diff --git a/programming_examples/basic/vector_exp/aie2.py b/programming_examples/basic/vector_exp/aie2.py
index af58a6392b..99d26ef316 100644
--- a/programming_examples/basic/vector_exp/aie2.py
+++ b/programming_examples/basic/vector_exp/aie2.py
@@ -113,7 +113,6 @@ def sequence(A, C):
             npu_dma_memcpy_nd(
                 metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/vector_reduce_add/aie2.py b/programming_examples/basic/vector_reduce_add/aie2.py
index fe6f049984..564d875a9a 100644
--- a/programming_examples/basic/vector_reduce_add/aie2.py
+++ b/programming_examples/basic/vector_reduce_add/aie2.py
@@ -70,7 +70,6 @@ def core_body():
         def sequence(A, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
             npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/vector_reduce_max/aie2.py b/programming_examples/basic/vector_reduce_max/aie2.py
index 31ee9f181a..2c55132579 100644
--- a/programming_examples/basic/vector_reduce_max/aie2.py
+++ b/programming_examples/basic/vector_reduce_max/aie2.py
@@ -70,7 +70,6 @@ def core_body():
         def sequence(A, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
             npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/vector_reduce_min/aie2.py b/programming_examples/basic/vector_reduce_min/aie2.py
index 430ad5f9ef..8ae8a92af5 100644
--- a/programming_examples/basic/vector_reduce_min/aie2.py
+++ b/programming_examples/basic/vector_reduce_min/aie2.py
@@ -70,7 +70,6 @@ def core_body():
         def sequence(A, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
             npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/vector_scalar_add/aie2.py b/programming_examples/basic/vector_scalar_add/aie2.py
index 46b44308b6..8562fc5745 100644
--- a/programming_examples/basic/vector_scalar_add/aie2.py
+++ b/programming_examples/basic/vector_scalar_add/aie2.py
@@ -71,7 +71,6 @@ def sequence(inTensor, outTensor):
             npu_dma_memcpy_nd(
                 metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, PROBLEM_SIZE]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 # Declares that subsequent code is in mlir-aie context
diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py
index 8d367ced50..82126016aa 100644
--- a/programming_examples/basic/vector_scalar_mul/aie2.py
+++ b/programming_examples/basic/vector_scalar_mul/aie2.py
@@ -101,7 +101,6 @@ def sequence(A, F, C):
             )
             npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N_in_i32s])
             npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:
diff --git a/programming_examples/basic/vector_vector_add/aie2.py b/programming_examples/basic/vector_vector_add/aie2.py
index 62ad20534c..2e0fdbfe22 100644
--- a/programming_examples/basic/vector_vector_add/aie2.py
+++ b/programming_examples/basic/vector_vector_add/aie2.py
@@ -81,7 +81,6 @@ def sequence(A, B, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/vector_vector_modulo/aie2.py b/programming_examples/basic/vector_vector_modulo/aie2.py
index 83d5675e85..a21bb87c16 100644
--- a/programming_examples/basic/vector_vector_modulo/aie2.py
+++ b/programming_examples/basic/vector_vector_modulo/aie2.py
@@ -81,7 +81,6 @@ def sequence(A, B, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/vector_vector_mul/aie2.py b/programming_examples/basic/vector_vector_mul/aie2.py
index fa07bbe58a..4b20bdfaa2 100644
--- a/programming_examples/basic/vector_vector_mul/aie2.py
+++ b/programming_examples/basic/vector_vector_mul/aie2.py
@@ -81,7 +81,6 @@ def sequence(A, B, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/ml/bottleneck/aie2.py b/programming_examples/ml/bottleneck/aie2.py
index 669aacb415..b4f7620d34 100644
--- a/programming_examples/ml/bottleneck/aie2.py
+++ b/programming_examples/ml/bottleneck/aie2.py
@@ -631,8 +631,6 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                     sizes=[1, 1, 1, totalWeightsSize32b],
                 )
 
-                npu_sync(column=0, row=0, direction=0, channel=0)
-
     print(ctx.module)
 
 
diff --git a/programming_examples/ml/conv2d/aie2.py b/programming_examples/ml/conv2d/aie2.py
index 11e92f55c2..189696067b 100644
--- a/programming_examples/ml/conv2d/aie2.py
+++ b/programming_examples/ml/conv2d/aie2.py
@@ -168,7 +168,6 @@ def sequence(I, W, O):
                     mem=W,
                     sizes=[1, 1, 1, weightsInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     #    print(ctx.module.operation.verify())
     print(ctx.module)
diff --git a/programming_examples/ml/conv2d_fused_relu/aie2.py b/programming_examples/ml/conv2d_fused_relu/aie2.py
index faafaf4d86..d0e591ad1c 100644
--- a/programming_examples/ml/conv2d_fused_relu/aie2.py
+++ b/programming_examples/ml/conv2d_fused_relu/aie2.py
@@ -254,7 +254,6 @@ def sequence(I, W, O):
                     mem=W,
                     sizes=[1, 1, 1, weightsInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     #    print(ctx.module.operation.verify())
     print(ctx.module)
diff --git a/programming_examples/ml/eltwise_add/aie2.py b/programming_examples/ml/eltwise_add/aie2.py
index 354e9f78d1..936da3f2bf 100644
--- a/programming_examples/ml/eltwise_add/aie2.py
+++ b/programming_examples/ml/eltwise_add/aie2.py
@@ -152,7 +152,6 @@ def sequence(A, B, C):
             npu_dma_memcpy_nd(
                 metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:
diff --git a/programming_examples/ml/eltwise_mul/aie2.py b/programming_examples/ml/eltwise_mul/aie2.py
index 5808d0c998..0f584ef38d 100644
--- a/programming_examples/ml/eltwise_mul/aie2.py
+++ b/programming_examples/ml/eltwise_mul/aie2.py
@@ -153,7 +153,6 @@ def sequence(A, B, C):
             npu_dma_memcpy_nd(
                 metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:
diff --git a/programming_examples/ml/relu/aie2.py b/programming_examples/ml/relu/aie2.py
index e4da4eafdf..a55f7cb179 100644
--- a/programming_examples/ml/relu/aie2.py
+++ b/programming_examples/ml/relu/aie2.py
@@ -124,7 +124,6 @@ def sequence(A, C):
             npu_dma_memcpy_nd(
                 metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:
diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py
index 94f5888512..150128d887 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py
+++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py
@@ -986,8 +986,6 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                     sizes=[1, 1, 1, totalWeightsSize32b_rest],
                 )
 
-                npu_sync(column=1, row=0, direction=0, channel=0)
-
     res = ctx.module.operation.verify()
     if res == True:
         print(ctx.module)
diff --git a/programming_examples/ml/softmax/aie2.py b/programming_examples/ml/softmax/aie2.py
index 47d60adf6a..269696f4e6 100755
--- a/programming_examples/ml/softmax/aie2.py
+++ b/programming_examples/ml/softmax/aie2.py
@@ -128,7 +128,6 @@ def sequence(A, C):
             npu_dma_memcpy_nd(
                 metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:
diff --git a/programming_examples/vision/color_detect/aie2_colorDetect.py b/programming_examples/vision/color_detect/aie2_colorDetect.py
index 19e4e04ca9..736ef2a6b3 100644
--- a/programming_examples/vision/color_detect/aie2_colorDetect.py
+++ b/programming_examples/vision/color_detect/aie2_colorDetect.py
@@ -266,7 +266,6 @@ def sequence(I, B, O):
                     mem=O,
                     sizes=[1, 1, 1, height * lineWidthInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
index 1215a4ddd0..7766cee3c5 100644
--- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py
+++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
@@ -284,7 +284,6 @@ def sequence(inTensor, notUsed, outTensor):
                     mem=outTensor,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     # print(ctx.module.operation.verify())
     print(ctx.module)
diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py
index 1af069d94e..c927cdd8c0 100644
--- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py
+++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py
@@ -312,7 +312,6 @@ def sequence(I, B, O):
                     mem=I,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     #    print(ctx.module.operation.verify())
     print(ctx.module)
diff --git a/programming_examples/vision/vision_passthrough/aie2.py b/programming_examples/vision/vision_passthrough/aie2.py
index 8d568af388..dad0ee09ff 100644
--- a/programming_examples/vision/vision_passthrough/aie2.py
+++ b/programming_examples/vision/vision_passthrough/aie2.py
@@ -165,7 +165,6 @@ def sequence(inTensor, notUsed, outTensor):
                     mem=outTensor,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir
index 0621e0b622..13c36fbf9e 100644
--- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir
+++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir
@@ -55,7 +55,6 @@ module @passThroughLine_aie2 {
             //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
             aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32>
             aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32>
-            aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
         }
     }
diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir
index c2c31b0d9b..67efe2e747 100644
--- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir
+++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir
@@ -56,7 +56,6 @@ module @passThroughLine_aie2 {
             //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
             aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32>
             aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @outOF, id = 0 : i64 } : memref<2073600xi32>
-            aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
         }
     }
diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir
index dd66475ca5..394ba07bd8 100644
--- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir
+++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir
@@ -55,7 +55,6 @@ module @passThroughLine_aie2 {
             //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
             aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<1152xi32>
             aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<1152xi32>
-            aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
         }
     }
diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py
index 6925e6bd2d..6ad968e7eb 100644
--- a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py
+++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py
@@ -60,7 +60,6 @@ def sequence(inTensor, notUsed, outTensor):
                 npu_dma_memcpy_nd(
                     metadata="in", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48]
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     res = ctx.module.operation.verify()
     if res == True:
diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py
index 989808392c..52c5923d2f 100644
--- a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py
+++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py
@@ -64,7 +64,6 @@ def sequence(inTensor, notUsed, outTensor):
                 npu_dma_memcpy_nd(
                     metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48]
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     res = ctx.module.operation.verify()
     if res == True:
diff --git a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py
index b8c264ea28..836c2fbba6 100644
--- a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py
+++ b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py
@@ -101,7 +101,6 @@ def sequence(inTensor, notUsed, outTensor):
                 npu_dma_memcpy_nd(
                     metadata="in", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48]
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_guide/section-4/section-4b/aie2.py b/programming_guide/section-4/section-4b/aie2.py
index 910d4b1a94..bb9a742dff 100644
--- a/programming_guide/section-4/section-4b/aie2.py
+++ b/programming_guide/section-4/section-4b/aie2.py
@@ -82,7 +82,6 @@ def sequence(A, F, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096])
             npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096])
             npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir
index 9641e2ac7c..214c4c4d4f 100644
--- a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir
+++ b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir
@@ -105,7 +105,6 @@ module {
     func.func @bobsyouruncle(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) {
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_in} : memref<64xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_out} : memref<64xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
   }
diff --git a/test/npu-xrt/add_314_using_dma_op/aie.mlir b/test/npu-xrt/add_314_using_dma_op/aie.mlir
index 8850e79ee0..5d408c6a2e 100644
--- a/test/npu-xrt/add_314_using_dma_op/aie.mlir
+++ b/test/npu-xrt/add_314_using_dma_op/aie.mlir
@@ -70,7 +70,6 @@ module {
       %c64_i64 = arith.constant 64 : i64
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
 
diff --git a/test/npu-xrt/add_one_objFifo/aie.mlir b/test/npu-xrt/add_one_objFifo/aie.mlir
index f05628a742..487901f383 100644
--- a/test/npu-xrt/add_one_objFifo/aie.mlir
+++ b/test/npu-xrt/add_one_objFifo/aie.mlir
@@ -49,7 +49,6 @@ module {
       %c64 = arith.constant 64 : i64
       aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
       aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
-      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
       return
     }
   }
diff --git a/test/npu-xrt/add_one_two/aie1.mlir b/test/npu-xrt/add_one_two/aie1.mlir
index 676dda4305..ab4bcba4e5 100644
--- a/test/npu-xrt/add_one_two/aie1.mlir
+++ b/test/npu-xrt/add_one_two/aie1.mlir
@@ -46,7 +46,6 @@ module {
       %c64 = arith.constant 64 : i64
       aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
       aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
-      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
       return
     }
   }
diff --git a/test/npu-xrt/add_one_using_dma/aie.mlir b/test/npu-xrt/add_one_using_dma/aie.mlir
index b8a319c431..94a88d3cf6 100644
--- a/test/npu-xrt/add_one_using_dma/aie.mlir
+++ b/test/npu-xrt/add_one_using_dma/aie.mlir
@@ -81,7 +81,6 @@ module {
       %c64_i64 = arith.constant 64 : i64
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
 
diff --git a/test/npu-xrt/cascade_flows/aie.mlir b/test/npu-xrt/cascade_flows/aie.mlir
index 54fbd77091..a71c36936f 100644
--- a/test/npu-xrt/cascade_flows/aie.mlir
+++ b/test/npu-xrt/cascade_flows/aie.mlir
@@ -65,7 +65,6 @@ module {
       %c64 = arith.constant 64 : i64
       aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
       aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
-      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
       return
     }
   }
diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir
index 3112c0c05e..c39198a865 100644
--- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir
+++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir
@@ -641,7 +641,6 @@ module {
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
   } {sym_name = "segment_0"}
diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir
index fb58fa0fb0..60e48e95ba 100644
--- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir
+++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir
@@ -489,7 +489,6 @@ module {
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
   } {sym_name = "segment_0"}
diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir
index e6d4d7df97..720ec3960a 100644
--- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir
+++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir
@@ -194,7 +194,6 @@ module {
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
   } {sym_name = "segment_0"}
diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir
index 6ffc1cfda2..878bd6b857 100644
--- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir
+++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir
@@ -525,7 +525,6 @@ module {
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
   } {sym_name = "segment_0"}
diff --git a/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir b/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir
index f589386ede..d1af86df00 100644
--- a/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir
+++ b/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir
@@ -119,7 +119,6 @@ module {
       aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 2 : i64, metadata = @inB} : memref<8192xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c4096_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 3 : i64, metadata = @inA} : memref<8192xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 4 : i64, metadata = @inB} : memref<8192xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
 
diff --git a/test/npu-xrt/two_col/aie.mlir b/test/npu-xrt/two_col/aie.mlir
index 4dac457a33..aed21f3cd7 100644
--- a/test/npu-xrt/two_col/aie.mlir
+++ b/test/npu-xrt/two_col/aie.mlir
@@ -142,7 +142,6 @@ module {
       aiex.npu.rtp_write(1, 5, 1, 0) { buffer_sym_name = "rtp3" }
       aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<2048xi32>
       aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<2048xi32>
-      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
       return
     }
   }
diff --git a/test/npu-xrt/vector_scalar_using_dma/aie.mlir b/test/npu-xrt/vector_scalar_using_dma/aie.mlir
index 81ccccffbd..c2a42f5a6c 100644
--- a/test/npu-xrt/vector_scalar_using_dma/aie.mlir
+++ b/test/npu-xrt/vector_scalar_using_dma/aie.mlir
@@ -71,7 +71,6 @@ module {
       %c4096_i64 = arith.constant 4096 : i64
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @out} : memref<4096xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @in} : memref<4096xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }