Skip to content

Commit

Permalink
[xla:emitters] tag XLA, XLA:CPU and XLA:GPU dialects as non-prod-compatible
Browse files Browse the repository at this point in the history

This paves the way for XLA:CPU fusion emitters.

Note that the XLA:CPU backend is non-prod-compatible, whereas the XLA:GPU
backend is not. The CPU fusion emitters will depend on the XLA, XLA:CPU and
XLA:GPU dialects, and since the emitters' dependents in XLA:CPU are
non-prod-compatible, all three dialects have to be tagged as such as well.

XLA:CPU passes also have to be tagged. Crucially, XLA:GPU passes are not
used by any of the above dialects nor by XLA:CPU passes, so XLA:GPU
remains essentially untouched; we just tag the XLA:GPU dialect.

Some common libraries in xla/codegen/emitters are also tagged.

PiperOrigin-RevId: 721954339
  • Loading branch information
cota authored and Google-ML-Automation committed Feb 1, 2025
1 parent c1ef7cc commit 170e331
Show file tree
Hide file tree
Showing 38 changed files with 360 additions and 619 deletions.
6 changes: 6 additions & 0 deletions xla/backends/cpu/codegen/emitters/ir/BUILD
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
load("//tensorflow:tensorflow.google.bzl", "get_compatible_with_portable")
load("//xla/tsl/platform:rules_cc.bzl", "cc_library")

package(
# copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
Expand All @@ -16,6 +18,7 @@ package_group(
td_library(
name = "xla_cpu_td_files",
srcs = glob(["*.td"]),
compatible_with = get_compatible_with_portable(),
includes = ["."],
deps = [
"@llvm-project//mlir:BuiltinDialectTdFiles",
Expand All @@ -25,6 +28,7 @@ td_library(

gentbl_cc_library(
name = "xla_cpu_dialect_inc_gen",
compatible_with = get_compatible_with_portable(),
strip_include_prefix = ".",
tbl_outs = [
(
Expand All @@ -43,6 +47,7 @@ gentbl_cc_library(

gentbl_cc_library(
name = "xla_cpu_types_inc_gen",
compatible_with = get_compatible_with_portable(),
strip_include_prefix = ".",
tbl_outs = [
(
Expand All @@ -67,6 +72,7 @@ gentbl_cc_library(

gentbl_cc_library(
name = "xla_cpu_ops_inc_gen",
compatible_with = get_compatible_with_portable(),
strip_include_prefix = ".",
tbl_outs = [
(
Expand Down
3 changes: 3 additions & 0 deletions xla/backends/cpu/codegen/emitters/transforms/BUILD
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
load("//tensorflow:tensorflow.google.bzl", "get_compatible_with_portable")
load("//xla/tsl/platform:rules_cc.bzl", "cc_library")

package(
# copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
Expand All @@ -15,6 +17,7 @@ package_group(

gentbl_cc_library(
name = "passes_inc_gen",
compatible_with = get_compatible_with_portable(),
tbl_outs = [
(
[
Expand Down
22 changes: 11 additions & 11 deletions xla/backends/gpu/codegen/emitters/emitter_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -574,33 +574,33 @@ absl::Status EmitterBase::RunPassPipeline(
}

void AddXlaGpuOpsOptimizationPasses(mlir::OpPassManager& pm) {
pm.addNestedPass<FuncOp>(CreateSimplifyArithPass());
pm.addNestedPass<FuncOp>(emitters::CreateSimplifyArithPass());
pm.addPass(mlir::createCanonicalizerPass());
pm.addPass(mlir::createCSEPass());
pm.addPass(CreateEraseDeadFunctionsPass());
pm.addPass(emitters::CreateEraseDeadFunctionsPass());
pm.addPass(mlir::createCSEPass());
}

void AddLoopTransformationPasses(mlir::OpPassManager& pm,
const se::DeviceDescription& device) {
pm.addNestedPass<FuncOp>(
CreateLowerXlaGpuToScfPass(device.threads_per_warp()));
emitters::CreateLowerXlaToScfPass(device.threads_per_warp()));
pm.addNestedPass<FuncOp>(CreateFuseLoopsPass());
pm.addPass(mlir::createInlinerPass({}, [&](mlir::OpPassManager& pm) {
// CSE after inlining because inlining can introduce duplicates.
pm.addPass(mlir::createCSEPass());
}));
pm.addPass(mlir::createCanonicalizerPass());
pm.addPass(mlir::createCSEPass());
pm.addNestedPass<FuncOp>(CreatePeelLoopsPass());
pm.addNestedPass<FuncOp>(CreateLowerXlaGpuLoopsToScfPass());
pm.addNestedPass<FuncOp>(emitters::CreatePeelLoopsPass());
pm.addNestedPass<FuncOp>(emitters::CreateLowerXlaLoopsToScfPass());
pm.addPass(mlir::mhlo::createConvertToSignlessPass());
pm.addPass(CreatePropagateSliceIndicesPass());
pm.addPass(emitters::CreatePropagateSliceIndicesPass());
pm.addPass(emitters::CreateFlattenTensorsPass());
// We need LICM before unswitching loops, because our loop unswitcher only
// detects for loops with a single if inside them.
pm.addPass(mlir::createLoopInvariantCodeMotionPass());
pm.addNestedPass<FuncOp>(CreateUnswitchLoopsPass());
pm.addNestedPass<FuncOp>(emitters::CreateUnswitchLoopsPass());
// We need LICM again after unswitching, because that can introduce new
// opportunities for LICM. This would not be necessary if LICM also moved
// instructions over ifs.
Expand All @@ -613,17 +613,17 @@ void AddLoopTransformationPasses(mlir::OpPassManager& pm,

void AddLoweringPasses(mlir::OpPassManager& pm,
const se::DeviceDescription& device) {
pm.addNestedPass<FuncOp>(CreateConvertPureCallOpsPass());
pm.addNestedPass<FuncOp>(emitters::CreateConvertPureCallOpsPass());
pm.addPass(emitters::CreateLowerTensorsPass(device));
pm.addPass(mlir::createConvertComplexToStandardPass());
pm.addPass(CreateMergePointersToSameSlicePass());
pm.addPass(emitters::CreateMergePointersToSameSlicePass());

// LowerTensors creates new affine.apply ops. Fold and CSE them so
// simplify-affine has maximally folded expressions to work with.
pm.addPass(mlir::createCanonicalizerPass());
pm.addPass(mlir::createCSEPass());
pm.addNestedPass<FuncOp>(CreateSimplifyArithPass());
pm.addPass(CreateSimplifyAffinePass());
pm.addNestedPass<FuncOp>(emitters::CreateSimplifyArithPass());
pm.addPass(emitters::CreateSimplifyAffinePass());
pm.addPass(CreateConvertIndexTypePass());
// simplify-affine lowers most affine.apply ops, but if it can't prove a
// division or modulo is unsigned, affine.apply ops will remain.
Expand Down
7 changes: 7 additions & 0 deletions xla/backends/gpu/codegen/emitters/ir/BUILD
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
load("//tensorflow:tensorflow.google.bzl", "get_compatible_with_portable")
load("//xla/tsl/platform:rules_cc.bzl", "cc_library")

package(
# copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
Expand All @@ -16,6 +18,7 @@ package_group(
td_library(
name = "xla_gpu_td_files",
srcs = glob(["*.td"]),
compatible_with = get_compatible_with_portable(),
includes = ["."],
deps = [
"//xla/codegen/emitters/ir:xla_td_files",
Expand All @@ -30,6 +33,7 @@ td_library(

gentbl_cc_library(
name = "xla_gpu_dialect_inc_gen",
compatible_with = get_compatible_with_portable(),
strip_include_prefix = ".",
tbl_outs = [
(
Expand All @@ -48,6 +52,7 @@ gentbl_cc_library(

gentbl_cc_library(
name = "xla_gpu_ops_inc_gen",
compatible_with = get_compatible_with_portable(),
strip_include_prefix = ".",
tbl_outs = [
(
Expand All @@ -66,6 +71,7 @@ gentbl_cc_library(

gentbl_cc_library(
name = "xla_gpu_attrs_inc_gen",
compatible_with = get_compatible_with_portable(),
strip_include_prefix = ".",
tbl_outs = [
(
Expand Down Expand Up @@ -98,6 +104,7 @@ gentbl_cc_library(

gentbl_cc_library(
name = "xla_gpu_types_inc_gen",
compatible_with = get_compatible_with_portable(),
strip_include_prefix = ".",
tbl_outs = [
(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: test_correctness %s --bijection_inputs=reduce:0 --bijection_outputs=reduce
// RUN: fusion_to_mlir %s | emitters_opt -cse -xla-gpu-simplify-arith -canonicalize | FileCheck %s --dump-input=always
// RUN: fusion_to_mlir %s | emitters_opt -cse -xla-simplify-arith -canonicalize | FileCheck %s --dump-input=always

add {
%p0 = f32[] parameter(0)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

// RUN: fusion_to_mlir %s | emitters_opt -cse -xla-gpu-simplify-arith -canonicalize | FileCheck %s --dump-input=always
// RUN: fusion_to_mlir %s | emitters_opt -cse -xla-simplify-arith -canonicalize | FileCheck %s --dump-input=always

add {
%p0 = f32[] parameter(0)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: fusion_to_mlir %s | emitters_opt -cse -xla-gpu-simplify-arith -canonicalize | FileCheck %s --dump-input=always
// RUN: fusion_to_mlir %s | emitters_opt -cse -xla-simplify-arith -canonicalize | FileCheck %s --dump-input=always
// RUN: test_correctness %s --bijection_inputs=reduce:0 --bijection_outputs=reduce

add {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: fusion_to_mlir %s | emitters_opt -cse -xla-gpu-simplify-arith -canonicalize | FileCheck %s --dump-input=always
// RUN: fusion_to_mlir %s | emitters_opt -cse -xla-simplify-arith -canonicalize | FileCheck %s --dump-input=always

add {
%p0 = f32[] parameter(0)
Expand Down
11 changes: 0 additions & 11 deletions xla/backends/gpu/codegen/emitters/transforms/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,8 @@ cc_library(
srcs = [
"convert_float_nvidia.cc",
"convert_index_type.cc",
"convert_xla_gpu_pure_call_ops.cc",
"erase_dead_functions.cc",
"fuse_loops.cc",
"lower_xla_gpu_to_scf.cc",
"merge_pointers_to_same_slice.cc",
"optimize_loops.cc",
"peel_loops.cc",
"propagate_slice_indices.cc",
"simplify_affine.cc",
"simplify_arith.cc",
"unswitch_loops.cc",
"vectorize_loads_stores.cc",
],
hdrs = ["passes.h"],
Expand All @@ -61,7 +52,6 @@ cc_library(
"//xla:util",
"//xla:xla_data_proto_cc",
"//xla/backends/gpu/codegen/emitters/ir:xla_gpu",
"//xla/codegen/emitters:elemental_hlo_to_mlir",
"//xla/codegen/emitters/ir:xla",
"//xla/codegen/emitters/transforms:atomic_rmw_utils",
"//xla/hlo/analysis:indexing_analysis",
Expand All @@ -74,7 +64,6 @@ cc_library(
"//xla/stream_executor:semantic_version",
"@com_google_absl//absl/algorithm:container",
"@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/log",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/strings",
Expand Down
10 changes: 0 additions & 10 deletions xla/backends/gpu/codegen/emitters/transforms/passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,18 +37,8 @@ std::unique_ptr<mlir::Pass> CreateConvertFloatNvidiaPass();
std::optional<std::unique_ptr<mlir::Pass>> MaybeCreateConvertFloatNvidiaPass(
const se::DeviceDescription& device_description);
std::unique_ptr<mlir::Pass> CreateConvertIndexTypePass();
std::unique_ptr<mlir::Pass> CreateConvertPureCallOpsPass();
std::unique_ptr<mlir::Pass> CreateEraseDeadFunctionsPass();
std::unique_ptr<mlir::Pass> CreateLowerXlaGpuToScfPass(int64_t warp_size = 32);
std::unique_ptr<mlir::Pass> CreateLowerXlaGpuLoopsToScfPass();
std::unique_ptr<mlir::Pass> CreateMergePointersToSameSlicePass();
std::unique_ptr<mlir::Pass> CreateOptimizeLoopsPass();
std::unique_ptr<mlir::Pass> CreateFuseLoopsPass();
std::unique_ptr<mlir::Pass> CreatePeelLoopsPass();
std::unique_ptr<mlir::Pass> CreatePropagateSliceIndicesPass();
std::unique_ptr<mlir::Pass> CreateSimplifyAffinePass();
std::unique_ptr<mlir::Pass> CreateSimplifyArithPass();
std::unique_ptr<mlir::Pass> CreateUnswitchLoopsPass();
std::unique_ptr<mlir::Pass> CreateVectorizeLoadsAndStoresPass(
const std::string& gpu_device_info = "");
std::unique_ptr<mlir::Pass> CreateVectorizeLoadsAndStoresPass(
Expand Down
Loading

0 comments on commit 170e331

Please sign in to comment.