iree-org · zjgarvey · Dec 5, 2024 · Dec 3, 2024 · Dec 3, 2024 · Dec 4, 2024
@@ -63,12 +63,35 @@ struct DetachElementwisePattern
       return failure();
     }
     auto outputType = llvm::cast<RankedTensorType>(outputOperand.getType());
+    int64_t outputRank = outputType.getRank();
     if (!outputType.getElementType().isIntOrFloat())
       return failure();
     auto elementType = outputType.getElementType();
 
     Location loc = linalgOp.getLoc();
 
+    // Verify that each output map result is a dim expr of a parallel iterator.
+    auto outputMap = mlir::compressUnusedDims(
+        linalgOp.getMatchingIndexingMap(outputOperands.front()));
+    SmallVector<utils::IteratorType> iterators;
+    iterators.reserve(outputMap.getNumResults());
+    for (int i = 0, e = outputMap.getNumResults(); i < e; ++i) {
+      auto expr = dyn_cast<AffineDimExpr>(outputMap.getResult(i));
+      if (!expr)
+        return rewriter.notifyMatchFailure(
+            linalgOp, "output affine map has a non dim expression at " +
+                          std::to_string(i));
+      int pos = expr.getPosition();
+      auto attr = linalgOp.getIteratorTypesArray()[pos];
+      if (!linalg::isParallelIterator(attr))
+        return rewriter.notifyMatchFailure(
+            linalgOp, "output iterator type is not parallel at position " +
+                          std::to_string(pos));
+      iterators.push_back(attr);
+    }
+
+    SmallVector<AffineMap> maps(3, rewriter.getMultiDimIdentityMap(outputRank));
+
     // Create a zero tensor as the new output tensor operand to the Linalg
     // contraction op.
     SmallVector<OpFoldResult> mixedSizes =
@@ -84,24 +107,6 @@ struct DetachElementwisePattern
     rewriter.modifyOpInPlace(linalgOp,
                              [&]() { linalgOp.setDpsInitOperand(0, fill); });
 
-    auto outputMap = mlir::compressUnusedDims(
-        linalgOp.getMatchingIndexingMap(outputOperands.front()));
-    // Only support identity map for output access for now; this is the case for
-    // all existing contraction/convolution ops.
-    if (!outputMap.isIdentity())
-      return failure();
-    SmallVector<AffineMap> maps(3, outputMap);
-
-    SmallVector<utils::IteratorType> iterators;
-    iterators.reserve(outputMap.getNumResults());
-    for (int i = 0, e = outputMap.getNumResults(); i < e; ++i) {
-      int pos = cast<AffineDimExpr>(outputMap.getResult(i)).getPosition();
-      auto attr = linalgOp.getIteratorTypesArray()[pos];
-      if (!linalg::isParallelIterator(attr))
-        return failure();
-      iterators.push_back(attr);
-    }
-
     // Create a generic op to add back the original output tensor operand.
     rewriter.setInsertionPointAfter(linalgOp);
     auto genericOp = rewriter.create<linalg::GenericOp>(

@@ -101,6 +101,27 @@ util.func public @conv(%input: tensor<1x225x225x3xf32>, %filter: tensor<3x3x3x32
 
 // -----
 
+util.func public @depthwise_conv(%arg0: tensor<1x96x62x62xf32>, %arg1: tensor<96x7x7xf32>, %arg2: tensor<96xf32>) -> tensor<1x96x56x56xf32> {
+  %0 = tensor.empty() : tensor<1x96x56x56xf32>
+  %broadcasted = linalg.broadcast ins(%arg2 : tensor<96xf32>) outs(%0 : tensor<1x96x56x56xf32>) dimensions = [0, 2, 3]
+  %1 = linalg.depthwise_conv_2d_nchw_chw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<1x96x62x62xf32>, tensor<96x7x7xf32>) outs(%broadcasted : tensor<1x96x56x56xf32>) -> tensor<1x96x56x56xf32>
+  util.return %1 : tensor<1x96x56x56xf32>
+}
+
+// CHECK-LABEL: util.func public @depthwise_conv
+//  CHECK-SAME: (%[[INPUT:.+]]: tensor<1x96x62x62xf32>, %[[FILTER:.+]]: tensor<96x7x7xf32>, %[[BIAS:.+]]: tensor<96xf32>)
+//       CHECK:   %[[INIT:.+]] = linalg.broadcast
+//  CHECK-SAME:     ins(%[[BIAS]] :
+//       CHECK:   %[[FILL:.+]] = linalg.fill
+//       CHECK:   %[[CONV:.+]] = linalg.depthwise_conv_2d_nchw_chw
+//  CHECK-SAME:     ins(%[[INPUT]], %[[FILTER]]
+//  CHECK-SAME:     outs(%[[FILL]]
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     ins(%[[CONV]], %[[INIT]]
+//  CHECK-SAME:     outs(%[[FILL]]
+
+// -----
+
 util.func public @keep_fill(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index