Enable end to end DPS testing

Implement Python binding changes to allow the execute function to return multiple results. Update tests to use the non-DPS style calling convention. Also, enable end-to-end lowering by enabling conversion of the closed alloc group op to the tensorrt dialect. Miscellaneous fixes: 1. Add missing handling of `CallAllocOp` in the EliminateShapeOps pass. 2. Skip non-ranked tensor type function arguments while collecting host tensor arguments. 3. Temporarily add a pass to remove the clone operation in the MemRefToExecutor dialect conversion. 4. Relax memref creation for empty shape tensors. 5. Fix the lifetime of memrefs returned from Lua function results; this required the session allocator to track returned memrefs. Also: fix incorrect indexing into output memref results; return an error status instead of silently erroring out during TensorRT weight conversion; address review comments.
NVIDIA · Feb 5, 2025 · b25f807 · b25f807
1 parent 87e5869
commit b25f807
Show file tree

Hide file tree

Showing 20 changed files with 734 additions and 105 deletions.
diff --git a/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td b/mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td
@@ -248,8 +248,8 @@ def StablehloClusteringPass : Pass<"stablehlo-clustering", "::mlir::ModuleOp"> {
     Option<"entrypoint", "entrypoint", "std::string", "\"\"",
       "the name of the entrypoint function; if empty then the clustering runs"
       " on all functions">,
-    Option<"enableNonDPSReturns",
-      "enable-non-dps-returns", "bool", "false",
+    Option<"forceEntrypointsReturnAllocs",
+      "force-entrypoints-return-allocs", "bool", "false",
       "allow backend clusters to directly allocate outputs">,
     Option<"disableCreateShapeFuncPass", "disable-create-shape-func-pass", "bool", "false",
       "don't apply create shape to func pass in TensorRT clusters">
@@ -331,7 +331,7 @@ def CreateClosedRegionsPass : Pass<"plan-create-closed-regions", "::mlir::Module
       "(used only in testing) specifies to outline regions by walking in "
       " pre-order; used for verifying results are not sensitive "
       "to traversal order">,
-    Option<"enableNonDPSReturns", "enable-non-dps-returns", "bool",
+    Option<"forceEntrypointsReturnAllocs", "force-entrypoints-return-allocs", "bool",
            /*default=*/"false",
            "Allow backend clusters to directly allocate outputs">
   ];

diff --git a/mlir-tensorrt/compiler/lib/Compiler/StablehloToExecutable/StablehloToExecutable.cpp b/mlir-tensorrt/compiler/lib/Compiler/StablehloToExecutable/StablehloToExecutable.cpp
@@ -130,7 +130,9 @@ void StablehloToExecutableTask::buildPostClusteringPipeline(
 
   // Perform bufferization.
   pm.addPass(createMemRefCastEliminationPass());
-  pm.addPass(plan::createPlanAllocTensorsPass());
+  plan::PlanAllocTensorsPassOptions allocTensorOpts{};
+  allocTensorOpts.forceEntrypointsReturnAllocs = opts.forceEntrypointsReturnAllocs;
+  pm.addPass(plan::createPlanAllocTensorsPass(allocTensorOpts));
   pm.addPass(plan::createPlanBufferizePass());
   pm.addPass(createMemRefCastEliminationPass());
   pm.addPass(createCanonicalizerPass());

diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CMakeLists.txt b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CMakeLists.txt
@@ -37,6 +37,7 @@ add_mlir_tensorrt_library(MLIRTensorRTPlanTransforms
   MLIRTensorRTStablehloScalarToArith
   MLIRTensorRTStablehloToTensorRT
   MLIRTensorRTTensorRTRuntimeDialect
+  MLIRBufferizationToMemRef
   MLIRTransforms
   StablehloOps
 )
diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CreateClosedRegions.cpp b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/CreateClosedRegions.cpp
@@ -561,12 +561,12 @@ createInlineClosedAllocGroupOp(RewriterBase &rewriter, plan::InlineGroupOp op,
 static LogicalResult createClosedGroupOp(RewriterBase &rewriter,
                                          plan::InlineGroupOp op,
                                          DataFlowSolver &solver,
-                                         bool enableNonDPSReturns) {
+                                         bool forceEntrypointsReturnAllocs) {
   OpBuilder::InsertionGuard g(rewriter);
 
   // Materialize destination operands if not using non-DPS call convention.
   SmallVector<DestinationOperandMaterializationResult> destinationOperands;
-  if (!enableNonDPSReturns)
+  if (!forceEntrypointsReturnAllocs)
     if (failed(materializeDestinationOperands(rewriter, op, solver,
                                               destinationOperands)))
       return failure();
@@ -581,7 +581,7 @@ static LogicalResult createClosedGroupOp(RewriterBase &rewriter,
 
   // Create and populate the appropriate closed group op based on call
   // convention.
-  if (!enableNonDPSReturns)
+  if (!forceEntrypointsReturnAllocs)
     return createInlineClosedGroupOp(rewriter, op, solver, inputs,
                                      destinationOperands);
   return createInlineClosedAllocGroupOp(rewriter, op, solver, inputs);
@@ -629,7 +629,7 @@ class CreateClosedRegionsPass
     IRRewriter rewriter(ctx);
     for (InlineGroupOp groupOp : llvm::make_early_inc_range(groupOps)) {
       if (failed(createClosedGroupOp(rewriter, groupOp, solver,
-                                     enableNonDPSReturns)))
+                                     forceEntrypointsReturnAllocs)))
         return signalPassFailure();
     }
   }

diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/Passes.cpp b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/Passes.cpp
@@ -24,6 +24,7 @@
 //===----------------------------------------------------------------------===//
 #include "mlir-tensorrt/Dialect/Plan/Transforms/Passes.h"
 #include "mlir-tensorrt/Transforms/Passes.h"
+#include "mlir/Conversion/BufferizationToMemRef/BufferizationToMemRef.h"
 #include "mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h"
 #include "mlir/Dialect/Bufferization/Pipelines/Passes.h"
 #include "mlir/Dialect/Bufferization/Transforms/Passes.h"
@@ -48,7 +49,7 @@ void plan::buildPlanSegmentationPipeline(
       plan::createPlanPopulateFunctionBoundsAttributesPass());
   pm.addPass(plan::createStablehloClusteringPass(opts));
   plan::CreateClosedRegionsPassOptions closedRegionOptions{};
-  closedRegionOptions.enableNonDPSReturns = opts.enableNonDPSReturns;
+  closedRegionOptions.forceEntrypointsReturnAllocs = opts.forceEntrypointsReturnAllocs;
   pm.addPass(plan::createCreateClosedRegionsPass(closedRegionOptions));
   pm.addPass(plan::createOutlineClustersPass());
   pm.addPass(mlir::createFuncExtDuplicateFunctionEliminationPass());
@@ -80,6 +81,7 @@ void plan::buildPlanBufferDeallocationPipeline(
   pm.addPass(createCanonicalizerPass());
   pm.addPass(bufferization::createBufferDeallocationSimplificationPass());
   pm.addPass(bufferization::createLowerDeallocationsPass());
+  pm.addPass(mlir::createBufferizationToMemRefPass());
   pm.addPass(createCSEPass());
   pm.addPass(createCanonicalizerPass());
 }
@@ -112,19 +114,17 @@ struct PlanBufferizationPipelineCliOpts
 // Register pipelines.
 
 void plan::registerPlanDialectPipelines() {
-  PassPipelineRegistration<PlanBufferizationPipelineCliOpts>
-      executorBufferizationPipeline(
-          "plan-bufferize-pipeline",
-          "perform bufferization and standard pre/post processing passes",
-          [](OpPassManager &pm, const PlanBufferizationPipelineCliOpts &opts) {
-            PlanAllocTensorsPassOptions allocTensorOpts{};
-            allocTensorOpts.forceEntrypointsReturnAllocs =
-                opts.forceEntrypointsReturnAllocs;
-            buildPlanBufferizationPipeline(pm, allocTensorOpts);
-            buildPlanBufferOptimizationPipeline(pm);
-            buildPlanBufferDeallocationPipeline(
-                pm, bufferization::DeallocationOptions{false});
-          });
+  PassPipelineRegistration<PlanBufferizationPipelineCliOpts> executorBufferizationPipeline(
+      "plan-bufferize-pipeline",
+      "perform bufferization and standard pre/post processing passes",
+      [](OpPassManager &pm, const PlanBufferizationPipelineCliOpts &opts) {
+        PlanAllocTensorsPassOptions allocTensorOpts{};
+        allocTensorOpts.forceEntrypointsReturnAllocs = opts.forceEntrypointsReturnAllocs;
+        buildPlanBufferizationPipeline(pm, allocTensorOpts);
+        buildPlanBufferOptimizationPipeline(pm);
+        buildPlanBufferDeallocationPipeline(
+            pm, bufferization::DeallocationOptions{false});
+      });
 
   PassPipelineRegistration<> bufferOptPipeline(
       "plan-buffer-opt-pipeline", "perform post-bufferization optimizations",

diff --git a/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/StablehloClustering.cpp b/mlir-tensorrt/compiler/lib/Dialect/Plan/Transforms/StablehloClustering.cpp
@@ -280,7 +280,7 @@ class StablehloClusteringPass
       if (failed(
               applyClusteringToFunc(rewriter, func, solver, schedule,
                                     StablehloClusteringPassOptions{
-                                        entrypoint, enableNonDPSReturns,
+                                        entrypoint, forceEntrypointsReturnAllocs,
                                         /*disableCreateShapeFuncPass=*/false})))
         return signalPassFailure();
     }

diff --git a/mlir-tensorrt/compiler/test/Dialect/Plan/create-closed-regions.mlir b/mlir-tensorrt/compiler/test/Dialect/Plan/create-closed-regions.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-tensorrt-opt %s -plan-create-closed-regions -split-input-file | FileCheck %s
 // RUN: mlir-tensorrt-opt %s -plan-create-closed-regions=test-pre-walk-order=true -split-input-file | FileCheck %s
-// RUN: mlir-tensorrt-opt %s -plan-create-closed-regions=enable-non-dps-returns=true -split-input-file | FileCheck %s --check-prefix=CHECK-ALLOC
+// RUN: mlir-tensorrt-opt %s -plan-create-closed-regions=force-entrypoints-return-allocs=true -split-input-file | FileCheck %s --check-prefix=CHECK-ALLOC
 
 func.func @test_simple_static(%arg0: tensor<10xf32>, %arg1: tensor<10xf32>) -> tensor<10xf32> {
   %0 = plan.inline_group target(#plan.tensorrt_cluster<disallow_shape_tensor_calculations = false, benefit = 1>) -> tensor<10xf32> {