From 07140b47100caf3d31489b5a6441a242b72ff235 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Fri, 7 Jun 2024 17:13:21 +0200 Subject: [PATCH 1/7] Add operator AddAdd, MulMul --- operators/cuda/add_mul.h | 37 ++++++++ operators/cuda/add_mul_impl.cu | 147 ++++++++++++++++++++++++++++---- operators/cuda/add_mul_impl.cuh | 6 +- operators/cuda/cuda_ops.cc | 10 +++ test/cuda/test_cudaops.py | 83 +++++++++++++++++- 5 files changed, 264 insertions(+), 19 deletions(-) diff --git a/operators/cuda/add_mul.h b/operators/cuda/add_mul.h index 5b8f36f21..3f6cc1e64 100644 --- a/operators/cuda/add_mul.h +++ b/operators/cuda/add_mul.h @@ -43,4 +43,41 @@ struct AddOrMulSharedInput { } }; +template +struct AddOrMulTwice { + template + OrtxStatus OnModelAttach(const TDict& /*dict*/) { + return {}; + } + OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx, + const ortc::Tensor& tensor_a, + const ortc::Tensor& tensor_b, + const ortc::Tensor& tensor_c, + ortc::Tensor& output) const { + const T* input_data_a = tensor_a.Data(); + const T* input_data_b = tensor_b.Data(); + const T* input_data_c = tensor_c.Data(); + + auto length_a = tensor_a.NumberOfElement(); + auto length_b = tensor_b.NumberOfElement(); + auto length_c = tensor_c.NumberOfElement(); + + T* output_data_ab = output_ab.Allocate( + length_a <= length_b + ? lenght_c <= length_b ? tensor_b.Shape() : tensor_c.Shape() + : lenght_a <= length_b ? tensor_b.Shape() : tensor_a.Shape()); + + if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) { + return {}; + } + LaunchAddOrMulTwiceKernel(reinterpret_cast(ctx->GetCudaStream()), + input_data_a, input_data_b, input_data_c, + output_data, + length_a, length_b, length_c, + addition); + return {}; + } +}; + + } // namespace contrib \ No newline at end of file diff --git a/operators/cuda/add_mul_impl.cu b/operators/cuda/add_mul_impl.cu index 85f55bc77..0373ddf28 100644 --- a/operators/cuda/add_mul_impl.cu +++ b/operators/cuda/add_mul_impl.cu @@ -12,12 +12,12 @@ using namespace Ort::Custom; -__device__ __forceinline__ void _add3_op(float* ab, float* ac, const float a, const float b, const float c) { +__device__ __forceinline__ void _add3_2_op(float* ab, float* ac, const float a, const float b, const float c) { *ab = a + b; *ac = a + c; } -__device__ __forceinline__ void _add3_op(half* ab, half* ac, const half a, const half b, const half c) { +__device__ __forceinline__ void _add3_2_op(half* ab, half* ac, const half a, const half b, const half c) { #if __CUDA_ARCH__ < 700 *ab = __float2half(__half2float(a) + __half2float(b)); *ac = __float2half(__half2float(a) + __half2float(c)); @@ -27,12 +27,12 @@ __device__ __forceinline__ void _add3_op(half* ab, half* ac, const half a, const #endif } -__device__ __forceinline__ void _mul3_op(float* ab, float* ac, const float a, const float b, const float c) { +__device__ __forceinline__ void _mul3_2_op(float* ab, float* ac, const float a, const float b, const float c) { *ab = a * b; *ac = a * c; } -__device__ __forceinline__ void _mul3_op(half* ab, half* ac, const half a, const half b, const half c) { +__device__ __forceinline__ void _mul3_2_op(half* ab, half* ac, const half a, const half b, const half c) { #if __CUDA_ARCH__ < 700 *ab = __float2half(__half2float(a) * __half2float(b)); *ac = __float2half(__half2float(a) * __half2float(c)); @@ -45,21 +45,21 @@ __device__ __forceinline__ void _mul3_op(half* ab, half* ac, const half a, const template struct Mul3SharedOp { __device__ __forceinline__ void operator()(T* ab, T* ac, const T a, const T b, const T c) const { 
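  // Mul3SharedOp / Add3SharedOp write two outputs per element, ab = a (op) b and
  // ac = a (op) c, so a single kernel launch serves both results that share input a.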
- _mul3_op(ab, ac, a, b, c); + _mul3_2_op(ab, ac, a, b, c); } }; template struct Add3SharedOp { __device__ __forceinline__ void operator()(T* ab, T* ac, const T a, const T b, const T c) const { - _add3_op(ab, ac, a, b, c); + _add3_2_op(ab, ac, a, b, c); } }; template -__global__ void AddMulKernel(T* output_ab, T* output_ac, const T* pA, const T* pB, - const T* pC, CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, - CUDA_LONG N, const TFunc func) { +__global__ void AddMulSharedInputKernel(T* output_ab, T* output_ac, const T* pA, const T* pB, + const T* pC, CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, + CUDA_LONG N, const TFunc func) { CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; CUDA_LONG id = start; #pragma unroll @@ -89,14 +89,14 @@ cudaError_t _LaunchAddOrMulSharedInputKernel(cudaStream_t stream, using TT = typename contrib::CudaT::MappedType; if (addition) { - AddMulKernel, num_threads_per_block, num_elements_per_thread> + AddMulSharedInputKernel, num_threads_per_block, num_elements_per_thread> <<>>( reinterpret_cast(output_ab), reinterpret_cast(output_ac), reinterpret_cast(pA), reinterpret_cast(pB), reinterpret_cast(pC), static_cast(countA), static_cast(countB), static_cast(countC), static_cast(max_count), Add3SharedOp()); } else { - AddMulKernel, num_threads_per_block, num_elements_per_thread> + AddMulSharedInputKernel, num_threads_per_block, num_elements_per_thread> <<>>( reinterpret_cast(output_ab), reinterpret_cast(output_ac), reinterpret_cast(pA), reinterpret_cast(pB), reinterpret_cast(pC), static_cast(countA), @@ -107,15 +107,132 @@ cudaError_t _LaunchAddOrMulSharedInputKernel(cudaStream_t stream, } template <> -cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, const float* input_a, const float* input_b, const float* input_c, +cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, + const float* input_a, const float* input_b, const float* input_c, float* output_ab, float* output_ac, int64_t length_a, int64_t length_b, int64_t length_c, bool addition) { - return _LaunchAddOrMulSharedInputKernel(stream, input_a, input_b, input_c, output_ab, output_ac, length_a, length_b, length_c, addition); + return _LaunchAddOrMulSharedInputKernel(stream, input_a, input_b, input_c, + output_ab, output_ac, + length_a, length_b, length_c, addition); } template <> -cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, const ortc::MFloat16* input_a, const ortc::MFloat16* input_b, const ortc::MFloat16* input_c, +cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, + const ortc::MFloat16* input_a, const ortc::MFloat16* input_b, const ortc::MFloat16* input_c, ortc::MFloat16* output_ab, ortc::MFloat16* output_ac, int64_t length_a, int64_t length_b, int64_t length_c, bool addition) { - return _LaunchAddOrMulSharedInputKernel(stream, input_a, input_b, input_c, output_ab, output_ac, length_a, length_b, length_c, addition); + return _LaunchAddOrMulSharedInputKernel(stream, input_a, input_b, input_c, + output_ab, output_ac, + length_a, length_b, length_c, addition); } + +__device__ __forceinline__ void _add3_op(float *address, const float a, const float b, + const float c) { + *address = a + b + c; +} + +__device__ __forceinline__ void _add3_op(half *address, const half a, const half b, + const half c) { +#if __CUDA_ARCH__ < 700 + *address = __float2half(__half2float(a) + __half2float(b) + __half2float(c)); +#else + *address = a + b + c; +#endif +} + +__device__ __forceinline__ void _mul3_op(float *address, const float a, const 
float b, + const float c) { + *address = a * b * c; +} + +__device__ __forceinline__ void _mul3_op(half *address, const half a, const half b, + const half c) { +#if __CUDA_ARCH__ < 700 + *address = __float2half(__half2float(a) * __half2float(b) * __half2float(c)); +#else + *address = a * b * c; +#endif +} + +template struct Mul3Op { + __device__ __inline__ void operator()(T *address, const T a, const T b, const T c) const { + _mul3_op(address, a, b, c); + } +}; + +template struct Add3Op { + __device__ __inline__ void operator()(T *address, const T a, const T b, const T c) const { + _add3_op(address, a, b, c); + } +}; + +template +__global__ void AddMulTwiceKernel(T* output, const T* pA, const T* pB, + const T* pC, CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, + CUDA_LONG N, const TFunc func) { + CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + CUDA_LONG id = start; +#pragma unroll + for (int i = 0; i < NumElementsPerThread; i++) { + if (id < N) { + func(output_ab, pA[id % nA], pB[id % nB], pC[id % nC]); + id += NumThreadsPerBlock; + } + } +} + +template +cudaError_t _LaunchAddOrMulTwiceKernel(cudaStream_t stream, + const T* pA, const T* pB, const T* pC, + T* output, + int64_t countA, int64_t countB, int64_t countC, bool addition) { + int64_t max_count = std::max(std::max(countA, countB), countC); + if (max_count == 0) // special case where there's a dim value of 0 in the output shape + return cudaGetLastError(); + + const int num_elements_per_thread = 4; + const int num_threads_per_block = 256; + const int num_el_th = num_threads_per_block * num_elements_per_thread; + + int blocksPerGrid = (max_count + num_el_th - 1) / num_el_th; + + using TT = typename contrib::CudaT::MappedType; + + if (addition) { + AddMulTwiceKernel, num_threads_per_block, num_elements_per_thread> + <<>>( + reinterpret_cast(output), + reinterpret_cast(pA), reinterpret_cast(pB), reinterpret_cast(pC), + static_cast(countA), static_cast(countB), static_cast(countC), + static_cast(max_count), Add3SharedOp()); + } else { + AddMulTwiceKernel, num_threads_per_block, num_elements_per_thread> + <<>>( + reinterpret_cast(output), + reinterpret_cast(pA), reinterpret_cast(pB), reinterpret_cast(pC), static_cast(countA), + static_cast(countB), static_cast(countC), + static_cast(max_count), Mul3SharedOp()); + } + return cudaGetLastError(); +} + +template <> +cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, + const float* input_a, const float* input_b, const float* input_c, + float* output, + int64_t length_a, int64_t length_b, int64_t length_c, bool addition) { + return _LaunchAddOrMulSharedInputKernel(stream, input_a, input_b, input_c, + output, + length_a, length_b, length_c, addition); +} + +template <> +cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, + const ortc::MFloat16* input_a, const ortc::MFloat16* input_b, const ortc::MFloat16* input_c, + ortc::MFloat16* output, + int64_t length_a, int64_t length_b, int64_t length_c, bool addition) { + return _LaunchAddOrMulSharedInputKernel(stream, input_a, input_b, input_c, + output, + length_a, length_b, length_c, addition); +} + diff --git a/operators/cuda/add_mul_impl.cuh b/operators/cuda/add_mul_impl.cuh index 9bf3ad853..6fdf9369e 100644 --- a/operators/cuda/add_mul_impl.cuh +++ b/operators/cuda/add_mul_impl.cuh @@ -8,4 +8,8 @@ template cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, const T* input_a, const T* input_b, const T* input_c, T* output_ab, T* output_ac, - int64_t length_a, int64_t 
length_b, int64_t length_c, bool addition); \ No newline at end of file + int64_t length_a, int64_t length_b, int64_t length_c, bool addition); + +template +cudaError_t LaunchAddOrMulTwiceKernel(cudaStream_t stream, const T* input_a, const T* input_b, const T* input_c, + T* output, int64_t length_a, int64_t length_b, int64_t length_c, bool addition); \ No newline at end of file diff --git a/operators/cuda/cuda_ops.cc b/operators/cuda/cuda_ops.cc index f8269302a..8c4808a5a 100644 --- a/operators/cuda/cuda_ops.cc +++ b/operators/cuda/cuda_ops.cc @@ -14,9 +14,15 @@ FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& { using AddSharedInputFloat32Type = typename contrib::AddOrMulSharedInput; using MulSharedInputFloat32Type = typename contrib::AddOrMulSharedInput; + using AddTwiceFloat32Type = typename contrib::AddOrMulTwice; + using MulTwiceFloat32Type = typename contrib::AddOrMulTwice; + #if ORT_API_VERSION >= 16 using AddSharedInputFloat16Type = typename contrib::AddOrMulSharedInput; using MulSharedInputFloat16Type = typename contrib::AddOrMulSharedInput; + + using AddTwiceFloat16Type = typename contrib::AddOrMulTwice; + using MulTwiceFloat16Type = typename contrib::AddOrMulTwice; #endif @@ -25,15 +31,19 @@ FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& { #ifdef USE_CUDA , CustomCudaStructV2("AddSharedInput", AddSharedInputFloat32Type), + CustomCudaStructV2("AddTwice", AddTwiceFloat32Type), CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat32Type), + CustomCudaStructV2("MulTwice", MulTwiceFloat32Type), CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1), #if ORT_API_VERSION >= 16 CustomCudaStructV2("AddSharedInput", AddSharedInputFloat16Type), + CustomCudaStructV2("AddTwice", AddTwiceFloat16Type), CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat16Type), + CustomCudaStructV2("MulTwice", MulTwiceFloat16Type), CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1) #endif #endif diff --git a/test/cuda/test_cudaops.py b/test/cuda/test_cudaops.py index df1ab2c47..7955795db 100644 --- a/test/cuda/test_cudaops.py +++ b/test/cuda/test_cudaops.py @@ -22,6 +22,83 @@ def _run(self, X): class TestCudaOps(unittest.TestCase): + def _addaddmulmul_cuda(self, itype, op_type, broad=False): + model1 = helper.make_model( + helper.make_graph( + [ + helper.make_node(op_type, ["X", "Y"], ["xy"]), + helper.make_node(op_type, ["xy", "Z"], ["final"]), + ], + "nd", + [ + helper.make_tensor_value_info("X", itype, [None, None, None]), + helper.make_tensor_value_info("Y", itype, [None, None, None]), + helper.make_tensor_value_info("Z", itype, [None, None, None]), + ], + [helper.make_tensor_value_info("final", itype, [None, None, None])], + ), + opset_imports=[helper.make_opsetid("", 18)], + ir_version=9, + ) + + model2 = helper.make_model( + helper.make_graph( + [ + helper.make_node( + f"{op_type}{op_type}", + ["X", "Y", "Z"], + ["final"], + domain="ai.onnx.contrib", + ) + ], + "nd", + [ + helper.make_tensor_value_info("X", itype, [None, None, None]), + helper.make_tensor_value_info("Y", itype, [None, None, None]), + helper.make_tensor_value_info("Z", itype, [None, None, None]), + ], + [helper.make_tensor_value_info("final", itype, [None, None, None])], + ), + opset_imports=[ + helper.make_opsetid("", 18), + helper.make_opsetid("ai.onnx.contrib", 1), + ], + ir_version=9, + ) + + dtype = np.float32 if itype == 
TensorProto.FLOAT else np.float16 + shapex = (1, 2, 3) if broad else (3, 2, 3) + shapey = (3, 2, 3) + shapez = (1, 2, 3) if broad else (3, 2, 3) + x = (np.arange(np.prod(shapex)) + 1).reshape(shapex).astype(dtype) + y = (np.arange(np.prod(shapey)) + 1).reshape(shapey).astype(dtype) + z = (np.arange(np.prod(shapez)) + 1).reshape(shapez).astype(dtype) + + feeds1 = dict(X=x, Y=y, Z=z) + ref = ReferenceEvaluator(model1) + expected = ref.run(None, feeds1)[0] + + opts = _ort.SessionOptions() + opts.register_custom_ops_library(_get_library_path()) + sess = _ort.InferenceSession(model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"]) + got = sess.run(None, feeds1)[0] + assert_almost_equal(expected, got) + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_mulmul_cuda(self): + self._addaddmulmul_cuda(TensorProto.FLOAT, "Mul") + self._addaddmulmul_cuda(TensorProto.FLOAT16, "Mul") + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_mulmul_cuda_broadcast(self): + self._addaddmulmul_cuda(TensorProto.FLOAT, "Mul", True) + self._addaddmulmul_cuda(TensorProto.FLOAT16, "Mul", True) + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_addadd_cuda(self): + self._addaddmulmul_cuda(TensorProto.FLOAT, "Add") + self._addaddmulmul_cuda(TensorProto.FLOAT16, "Add") + @staticmethod def _create_negpos_test_model(domain="ai.onnx.contrib"): nodes = [ @@ -151,7 +228,7 @@ def test_cuda_negxplus1(self): self._negxplus1_cuda(TensorProto.FLOAT16) def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3, 2, 3), shapec=(3, 2, 3)): - from onnx_extended.ortops.optim.cuda import get_ort_ext_libs + from ai.onnx.contrib import get_ort_ext_libs model1 = helper.make_model( helper.make_graph( @@ -181,7 +258,7 @@ def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3, f"{op_type}SharedInput", ["X", "Y", "Z"], ["XY", "XZ"], - domain="onnx_extended.ortops.optim.cuda", + domain="ai.onnx.contrib", ) ], "nd", @@ -197,7 +274,7 @@ def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3, ), opset_imports=[ helper.make_opsetid("", 18), - helper.make_opsetid("onnx_extended.ortops.optim.cuda", 1), + helper.make_opsetid("ai.onnx.contrib", 1), ], ir_version=9, ) From 31d8febd47c607eaa6b7386bfbfb75e297b78ed5 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 8 Jun 2024 21:36:27 +0200 Subject: [PATCH 2/7] add_mul --- operators/cuda/add_mul.h | 84 +++++++++++++-- operators/cuda/add_mul_impl.cu | 181 ++++++++++++++++++++++++++++++-- operators/cuda/add_mul_impl.cuh | 13 ++- operators/cuda/cuda_ops.cc | 12 ++- test/cuda/test_cudaops.py | 84 +++++++++++++++ 5 files changed, 355 insertions(+), 19 deletions(-) diff --git a/operators/cuda/add_mul.h b/operators/cuda/add_mul.h index 3f6cc1e64..43a1b97e2 100644 --- a/operators/cuda/add_mul.h +++ b/operators/cuda/add_mul.h @@ -63,21 +63,91 @@ struct AddOrMulTwice { auto length_c = tensor_c.NumberOfElement(); T* output_data_ab = output_ab.Allocate( - length_a <= length_b - ? lenght_c <= length_b ? tensor_b.Shape() : tensor_c.Shape() - : lenght_a <= length_b ? tensor_b.Shape() : tensor_a.Shape()); + length_a <= length_b + ? lenght_c <= length_b ? tensor_b.Shape() : tensor_c.Shape() + : lenght_a <= length_b ? 
tensor_b.Shape() + : tensor_a.Shape()); if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) { return {}; } LaunchAddOrMulTwiceKernel(reinterpret_cast(ctx->GetCudaStream()), - input_data_a, input_data_b, input_data_c, - output_data, - length_a, length_b, length_c, - addition); + input_data_a, input_data_b, input_data_c, + output_data, + length_a, length_b, length_c, + addition); return {}; } }; +template +struct AddAndMul { + template + OrtxStatus OnModelAttach(const TDict& dict) { + return {}; + } + OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx, + const ortc::Tensor& tensor_a, + const ortc::Tensor& tensor_b, + const ortc::Tensor& tensor_c, + ortc::Tensor& output) const { + const T* input_data_a = tensor_a.Data(); + const T* input_data_b = tensor_b.Data(); + const T* input_data_c = tensor_c.Data(); + + auto length_a = tensor_a.NumberOfElement(); + auto length_b = tensor_b.NumberOfElement(); + auto length_c = tensor_c.NumberOfElement(); + if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) { + return {}; + } + + std::vector dimsA = tensor_a.Shape(); + std::vector dimsB = tensor_b.Shape(); + std::vector dimsC = tensor_c.Shape(); + + auto max_length = std::max(length_a, std::max(length_b, length_c)); + + auto max_rank = std::max(dimsA.size(), std::max(dimsB.size(), dimsC.size())); + while (dimsA.size() < max_rank) + dimsA.insert(dimsA.begin(), 1); + while (dimsB.size() < max_rank) + dimsB.insert(dimsB.begin(), 1); + while (dimsC.size() < max_rank) + dimsC.insert(dimsC.begin(), 1); + + std::vector output_dims(dimsA.size()); + for (size_t i = 0; i < dimsA.size(); ++i) { + output_dims[i] = std::max(std::max(dimsA[i], dimsB[i]), dimsC[i]); + } + + if (switchMiddelAxis_) { + if (output_dims.size() != 4) { + ORTX_CXX_API_THROW("switchMiddleAxes only works with 4D tensors", ORT_RUNTIME_EXCEPTION); + } + int64_t d4 = output_dims[output_dims.size() - 1]; + int64_t d3 = output_dims[output_dims.size() - 2]; + int64_t d2 = output_dims[output_dims.size() - 3]; + output_dims[1] = d3; + output_dims[2] = d2; + LaunchAddAndMulSwitchMiddleAxesKernel(reinterpret_cast(ctx->GetCudaStream()), + input_data_a, input_data_b, input_data_c, + output_data, + length_a, length_b, length_c, + addition_first, d2, d3, d4); + } else { + T* output_data_ab = output_ab.Allocate(output_dims); + LaunchAddAndMulKernel(reinterpret_cast(ctx->GetCudaStream()), + input_data_a, input_data_b, input_data_c, + output_data, + length_a, length_b, length_c, + addition_first, switchMiddelAxis_); + } + return {}; + } + + private: + bool switchMiddelAxis_; +}; } // namespace contrib \ No newline at end of file diff --git a/operators/cuda/add_mul_impl.cu b/operators/cuda/add_mul_impl.cu index 0373ddf28..ff52e00cb 100644 --- a/operators/cuda/add_mul_impl.cu +++ b/operators/cuda/add_mul_impl.cu @@ -126,12 +126,12 @@ cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, length_a, length_b, length_c, addition); } -__device__ __forceinline__ void _add3_op(float *address, const float a, const float b, +__device__ __forceinline__ void _add3_op(float* address, const float a, const float b, const float c) { *address = a + b + c; } -__device__ __forceinline__ void _add3_op(half *address, const half a, const half b, +__device__ __forceinline__ void _add3_op(half* address, const half a, const half b, const half c) { #if __CUDA_ARCH__ < 700 *address = __float2half(__half2float(a) + __half2float(b) + __half2float(c)); @@ -140,12 +140,12 @@ __device__ __forceinline__ void _add3_op(half *address, const half a, const 
half #endif } -__device__ __forceinline__ void _mul3_op(float *address, const float a, const float b, +__device__ __forceinline__ void _mul3_op(float* address, const float a, const float b, const float c) { *address = a * b * c; } -__device__ __forceinline__ void _mul3_op(half *address, const half a, const half b, +__device__ __forceinline__ void _mul3_op(half* address, const half a, const half b, const half c) { #if __CUDA_ARCH__ < 700 *address = __float2half(__half2float(a) * __half2float(b) * __half2float(c)); @@ -154,14 +154,16 @@ __device__ __forceinline__ void _mul3_op(half *address, const half a, const half #endif } -template struct Mul3Op { - __device__ __inline__ void operator()(T *address, const T a, const T b, const T c) const { +template +struct Mul3Op { + __device__ __inline__ void operator()(T* address, const T a, const T b, const T c) const { _mul3_op(address, a, b, c); } }; -template struct Add3Op { - __device__ __inline__ void operator()(T *address, const T a, const T b, const T c) const { +template +struct Add3Op { + __device__ __inline__ void operator()(T* address, const T a, const T b, const T c) const { _add3_op(address, a, b, c); } }; @@ -201,7 +203,7 @@ cudaError_t _LaunchAddOrMulTwiceKernel(cudaStream_t stream, if (addition) { AddMulTwiceKernel, num_threads_per_block, num_elements_per_thread> <<>>( - reinterpret_cast(output), + reinterpret_cast(output), reinterpret_cast(pA), reinterpret_cast(pB), reinterpret_cast(pC), static_cast(countA), static_cast(countB), static_cast(countC), static_cast(max_count), Add3SharedOp()); @@ -236,3 +238,164 @@ cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, length_a, length_b, length_c, addition); } +__device__ __forceinline__ void _addmul_op(float* address, const float a, const float b, + const float c) { + *address = (a + b) * c; +} + +__device__ __forceinline__ void _addmul_op(half* address, const half a, const half b, + const half c) { +#if __CUDA_ARCH__ < 700 + *address = __float2half((__half2float(a) + __half2float(b)) * __half2float(c)); +#else + *address = (a + b) * c; +#endif +} + +__device__ __forceinline__ void _muladd_op(float* address, const float a, const float b, + const float c) { + *address = a * b + c; +} + +__device__ __forceinline__ void _muladd_op(half* address, const half a, const half b, + const half c) { +#if __CUDA_ARCH__ < 700 + *address = __float2half(__half2float(a) * __half2float(b) + __half2float(c)); +#else + *address = a * b + c; +#endif +} + +template +struct AddMul { + __device__ __inline__ void operator()(T* address, const T a, const T b, const T c) const { + _addmul_op(address, a, b, c); + } +}; + +template +struct MulAdd { + __device__ __inline__ void operator()(T* address, const T a, const T b, const T c) const { + _muladd_op(address, a, b, c); + } +}; + +template +__global__ void _AddAndMulKernel(T* output_data, const T* pA, const T* pB, const T* pC, + CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, CUDA_LONG N, + const TFunc func) { + CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + CUDA_LONG id = start; +#pragma unroll + for (int i = 0; i < NumElementsPerThread; i++) { + if (id < N) { + func(output_data + id, pA[id % nA], pB[id % nB], pC[id % nC]); + id += NumThreadsPerBlock; + } + } +} + +template +__global__ void _AddAndMulSwitchMiddleAxesKernel(T* output_data, const T* pA, const T* pB, + const T* pC, CUDA_LONG nA, CUDA_LONG nB, + CUDA_LONG nC, CUDA_LONG N, + const TFunc func, CUDA_LONG d2, + CUDA_LONG d3, CUDA_LONG d4) { + CUDA_LONG start = 
NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + CUDA_LONG id = start; + CUDA_LONG k, j, ido; +#pragma unroll + for (int i = 0; i < NumElementsPerThread; i++) { + if (id < N) { + k = (id / d4) % d3; + j = (id / (d4 * d3)) % d2; + ido = id + d4 * ((k * d2 + j) - (j * d3 + k)); + func(output_data + ido, pA[id % nA], pB[id % nB], pC[id % nC]); + id += NumThreadsPerBlock; + } + } +} + +template +cudaError_t _LaunchAddAndMulKernel(cudaStream_t stream, + const T* pA, const T* pB, const T* pC, + T* output, + int64_t countA, int64_t countB, int64_t countC, + bool addition_first, bool switchMiddleAxes) { + int64_t max_count = std::max(std::max(countA, countB), countC); + if (max_count == 0) // special case where there's a dim value of 0 in the output shape + return cudaGetLastError(); + + const int num_elements_per_thread = 4; + const int num_threads_per_block = 256; + const int num_el_th = num_threads_per_block * num_elements_per_thread; + + int blocksPerGrid = (max_count + num_el_th - 1) / num_el_th; + + using TT = typename contrib::CudaT::MappedType; + + if (addition_first) { + AddAndMulKernel, num_threads_per_block, num_elements_per_thread> + <<>>( + cuda_stream, + reinterpret_cast(output), + reinterpret_cast(pA), + reinterpret_cast(pB), + reinterpret_cast(pC), + countA, countB, countC, + max_size, AddMul()); + } else { + AddAndMulKernel, num_threads_per_block, num_elements_per_thread> + <<>>( + cuda_stream, + reinterpret_cast(output), + reinterpret_cast(pA), + reinterpret_cast(pB), + reinterpret_cast(pC), + countA, countB, countC, + max_size, MulAdd()); + } + return cudaGetLastError(); +} + +template +cudaError_t _LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream, + const T* pA, const T* pB, const T* pC, + T* output, + int64_t countA, int64_t countB, int64_t countC, + bool addition_first, int64_t d2, int64_t d3, int64_t d4) { + int64_t max_count = std::max(std::max(countA, countB), countC); + if (max_count == 0) // special case where there's a dim value of 0 in the output shape + return cudaGetLastError(); + + const int num_elements_per_thread = 4; + const int num_threads_per_block = 256; + const int num_el_th = num_threads_per_block * num_elements_per_thread; + + int blocksPerGrid = (max_count + num_el_th - 1) / num_el_th; + + using TT = typename contrib::CudaT::MappedType; + + if (addition_first) { + AddAndMulSwitchMiddleAxesKernel, num_threads_per_block, num_elements_per_thread> + <<>>( + cuda_stream, + reinterpret_cast(output), + reinterpret_cast(pA), + reinterpret_cast(pB), + reinterpret_cast(pC), + countA, countB, countC, + max_size, AddMul()); + } else { + AddAndMulSwitchMiddleAxesKernel, num_threads_per_block, num_elements_per_thread> + <<>>( + cuda_stream, + reinterpret_cast(output), + reinterpret_cast(pA), + reinterpret_cast(pB), + reinterpret_cast(pC), + countA, countB, countC, + max_size, MulAdd()); + } + return cudaGetLastError(); +} diff --git a/operators/cuda/add_mul_impl.cuh b/operators/cuda/add_mul_impl.cuh index 6fdf9369e..98bb06169 100644 --- a/operators/cuda/add_mul_impl.cuh +++ b/operators/cuda/add_mul_impl.cuh @@ -12,4 +12,15 @@ cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, const T* input_ template cudaError_t LaunchAddOrMulTwiceKernel(cudaStream_t stream, const T* input_a, const T* input_b, const T* input_c, - T* output, int64_t length_a, int64_t length_b, int64_t length_c, bool addition); \ No newline at end of file + T* output, int64_t length_a, int64_t length_b, int64_t length_c, bool addition); + +template 
+cudaError_t LaunchAddAndMulKernel(cudaStream_t stream, const T* input_a, const T* input_b, const T* input_c, + T* output, int64_t length_a, int64_t length_b, int64_t length_c, + bool addition, bool switchMiddleAxis); + +template +cudaError_t LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream, const T* input_a, const T* input_b, const T* input_c, + T* output, int64_t length_a, int64_t length_b, int64_t length_c, + bool addition, + int64_t d2, int64_t d3, int64_t d4); diff --git a/operators/cuda/cuda_ops.cc b/operators/cuda/cuda_ops.cc index 8c4808a5a..9ba3a7329 100644 --- a/operators/cuda/cuda_ops.cc +++ b/operators/cuda/cuda_ops.cc @@ -10,38 +10,46 @@ #endif FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& { - using AddSharedInputFloat32Type = typename contrib::AddOrMulSharedInput; using MulSharedInputFloat32Type = typename contrib::AddOrMulSharedInput; using AddTwiceFloat32Type = typename contrib::AddOrMulTwice; using MulTwiceFloat32Type = typename contrib::AddOrMulTwice; + using AddAndMulFloat32Type = typename contrib::AddOrMulTwice; + using MulAndAddFloat32Type = typename contrib::AddOrMulTwice; + #if ORT_API_VERSION >= 16 using AddSharedInputFloat16Type = typename contrib::AddOrMulSharedInput; using MulSharedInputFloat16Type = typename contrib::AddOrMulSharedInput; using AddTwiceFloat16Type = typename contrib::AddOrMulTwice; using MulTwiceFloat16Type = typename contrib::AddOrMulTwice; -#endif + using AddAndMulFloat32Type = typename contrib::AddOrMulTwice; + using MulAndAddFloat32Type = typename contrib::AddOrMulTwice; +#endif static OrtOpLoader op_loader( []() { return nullptr; } #ifdef USE_CUDA , + CustomCudaStructV2("AddMul", AddAndMulFloat32Type), CustomCudaStructV2("AddSharedInput", AddSharedInputFloat32Type), CustomCudaStructV2("AddTwice", AddTwiceFloat32Type), CustomCudaStructV2("FastGelu", contrib::FastGelu), + CustomCudaStructV2("MulAdd", MulAndAddFloat32Type), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat32Type), CustomCudaStructV2("MulTwice", MulTwiceFloat32Type), CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1), #if ORT_API_VERSION >= 16 + CustomCudaStructV2("AddMul", AddAndMulFloat16Type), CustomCudaStructV2("AddSharedInput", AddSharedInputFloat16Type), CustomCudaStructV2("AddTwice", AddTwiceFloat16Type), CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("FastGelu", contrib::FastGelu), + CustomCudaStructV2("MulAdd", MulAndAddFloat16Type), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat16Type), CustomCudaStructV2("MulTwice", MulTwiceFloat16Type), CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1) diff --git a/test/cuda/test_cudaops.py b/test/cuda/test_cudaops.py index 7955795db..db11b212c 100644 --- a/test/cuda/test_cudaops.py +++ b/test/cuda/test_cudaops.py @@ -339,6 +339,90 @@ def test_add_shared_input_cuda_broadcast2(self): shapec=(3, 2, 3), ) + def _addmul_cuda(self, itype, op_type1, op_type2, broad=False, negative=False): + model1 = helper.make_model( + helper.make_graph( + [ + helper.make_node(op_type1, ["Y", "X"] if negative else ["X", "Y"], ["xy"]), + helper.make_node(op_type2, ["Z", "xy"] if negative else ["xy", "Z"], ["final"]), + ], + "nd", + [ + helper.make_tensor_value_info("X", itype, [None, None, None]), + helper.make_tensor_value_info("Y", itype, [None, None, None]), + helper.make_tensor_value_info("Z", itype, [None, None, None]), + ], + [helper.make_tensor_value_info("final", itype, [None, None, None])], + ), + opset_imports=[helper.make_opsetid("", 18)], + ir_version=9, + ) + + 
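        # model1 above already swaps the operand order of each binary op when
        # `negative` is True; the attribute below asks the fused custom op, which
        # computes final = (X op1 Y) op2 Z in a single kernel, to do the same.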
kwargs = {"negative": 1} if negative else {} + model2 = helper.make_model( + helper.make_graph( + [ + helper.make_node( + f"{op_type1}{op_type2}", + ["X", "Y", "Z"], + ["final"], + domain="ai.onnx.contrib", + **kwargs, + ) + ], + "nd", + [ + helper.make_tensor_value_info("X", itype, [None, None, None]), + helper.make_tensor_value_info("Y", itype, [None, None, None]), + helper.make_tensor_value_info("Z", itype, [None, None, None]), + ], + [helper.make_tensor_value_info("final", itype, [None, None, None])], + ), + opset_imports=[ + helper.make_opsetid("", 18), + helper.make_opsetid("ai.onnx.contrib", 1), + ], + ir_version=9, + ) + + dtype = np.float32 if itype == TensorProto.FLOAT else np.float16 + shapex = (1, 2, 3) if broad else (3, 2, 3) + shapey = (3, 2, 3) + shapez = (1, 2, 3) if broad else (3, 2, 3) + x = (np.arange(np.prod(shapex)) + 1).reshape(shapex).astype(dtype) + y = (np.arange(np.prod(shapey)) + 1).reshape(shapey).astype(dtype) + z = (np.arange(np.prod(shapez)) + 1).reshape(shapez).astype(dtype) + + feeds1 = dict(X=x, Y=y, Z=z) + ref = ReferenceEvaluator(model1, verbose=0) + expected = ref.run(None, feeds1)[0] + + opts = _ort.SessionOptions() + opts.register_custom_ops_library(_get_library_path()) + sess = _ort.InferenceSession(model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"]) + got = sess.run(None, feeds1)[0] + assert_almost_equal(expected, got) + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_addmul_cuda(self): + self._addmul_cuda(TensorProto.FLOAT, "Add", "Mul") + self._addmul_cuda(TensorProto.FLOAT16, "Add", "Mul") + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_addmul_cuda_broadcast(self): + self._addmul_cuda(TensorProto.FLOAT, "Add", "Mul", True) + self._addmul_cuda(TensorProto.FLOAT16, "Add", "Mul", True) + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_muladd_cuda(self): + self._addmul_cuda(TensorProto.FLOAT, "Mul", "Add") + self._addmul_cuda(TensorProto.FLOAT16, "Mul", "Add") + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_submul_cuda(self): + self._addmul_cuda(TensorProto.FLOAT, "Sub", "Mul") + self._addmul_cuda(TensorProto.FLOAT16, "Sub", "Mul") + if __name__ == "__main__": unittest.main() From cd66c02d1ad68349f5d0880c103f67b28032523b Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 8 Jun 2024 23:16:35 +0200 Subject: [PATCH 3/7] add submul --- operators/cuda/add_mul.h | 56 +++++++- operators/cuda/add_mul_impl.cu | 218 +++++++++++++++++++++++++++++++- operators/cuda/add_mul_impl.cuh | 7 +- operators/cuda/cuda_ops.cc | 20 ++- test/cuda/test_cudaops.py | 20 +++ 5 files changed, 313 insertions(+), 8 deletions(-) diff --git a/operators/cuda/add_mul.h b/operators/cuda/add_mul.h index 43a1b97e2..a23e07b8c 100644 --- a/operators/cuda/add_mul.h +++ b/operators/cuda/add_mul.h @@ -141,7 +141,7 @@ struct AddAndMul { input_data_a, input_data_b, input_data_c, output_data, length_a, length_b, length_c, - addition_first, switchMiddelAxis_); + addition_first); } return {}; } @@ -150,4 +150,58 @@ struct AddAndMul { bool switchMiddelAxis_; }; +template +struct SubAndMul { + template + OrtxStatus OnModelAttach(const TDict& dict) { + return {}; + } + OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx, + const ortc::Tensor& tensor_a, + const ortc::Tensor& tensor_b, + const ortc::Tensor& tensor_c, + ortc::Tensor& output) const { + const T* input_data_a = tensor_a.Data(); + const T* input_data_b = tensor_b.Data(); + const T* input_data_c = 
tensor_c.Data(); + + auto length_a = tensor_a.NumberOfElement(); + auto length_b = tensor_b.NumberOfElement(); + auto length_c = tensor_c.NumberOfElement(); + if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) { + return {}; + } + + std::vector dimsA = tensor_a.Shape(); + std::vector dimsB = tensor_b.Shape(); + std::vector dimsC = tensor_c.Shape(); + + auto max_length = std::max(length_a, std::max(length_b, length_c)); + + auto max_rank = std::max(dimsA.size(), std::max(dimsB.size(), dimsC.size())); + while (dimsA.size() < max_rank) + dimsA.insert(dimsA.begin(), 1); + while (dimsB.size() < max_rank) + dimsB.insert(dimsB.begin(), 1); + while (dimsC.size() < max_rank) + dimsC.insert(dimsC.begin(), 1); + + std::vector output_dims(dimsA.size()); + for (size_t i = 0; i < dimsA.size(); ++i) { + output_dims[i] = std::max(std::max(dimsA[i], dimsB[i]), dimsC[i]); + } + + T* output_data_ab = output_ab.Allocate(output_dims); + LaunchSubAndMulKernel(reinterpret_cast(ctx->GetCudaStream()), + input_data_a, input_data_b, input_data_c, + output_data, + length_a, length_b, length_c, + subtract_first, negative_); + return {}; + } + + private: + bool negative_; +}; + } // namespace contrib \ No newline at end of file diff --git a/operators/cuda/add_mul_impl.cu b/operators/cuda/add_mul_impl.cu index ff52e00cb..14bf73b62 100644 --- a/operators/cuda/add_mul_impl.cu +++ b/operators/cuda/add_mul_impl.cu @@ -321,7 +321,7 @@ cudaError_t _LaunchAddAndMulKernel(cudaStream_t stream, const T* pA, const T* pB, const T* pC, T* output, int64_t countA, int64_t countB, int64_t countC, - bool addition_first, bool switchMiddleAxes) { + bool addition_first) { int64_t max_count = std::max(std::max(countA, countB), countC); if (max_count == 0) // special case where there's a dim value of 0 in the output shape return cudaGetLastError(); @@ -358,6 +358,22 @@ cudaError_t _LaunchAddAndMulKernel(cudaStream_t stream, return cudaGetLastError(); } +template <> +cudaError_t LaunchAddAndMulKernel(cudaStream_t stream, const float* input_a, const float* input_b, const float* input_c, + float* output, int64_t length_a, int64_t length_b, int64_t length_c, + bool addition) { + return _LaunchAddAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, addition_first); +} + +template <> +cudaError_t LaunchAddAndMulKernel(cudaStream_t stream, + const ortc::MFloat16* input_a, const ortc::MFloat16* input_b, + const ortc::MFloat16* input_c, + ortc::MFloat16* output, int64_t length_a, int64_t length_b, int64_t length_c, + bool addition) { + return _LaunchAddAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, addition_first); +} + template cudaError_t _LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream, const T* pA, const T* pB, const T* pC, @@ -399,3 +415,203 @@ cudaError_t _LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream, } return cudaGetLastError(); } + +template <> +cudaError_t LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream, const float* input_a, const float* input_b, const float* input_c, + float* output, int64_t length_a, int64_t length_b, int64_t length_c, + bool addition, + int64_t d2, int64_t d3, int64_t d4) { + return _LaunchAddAndMulSwitchMiddleAxesKernel(stream, pA, pB, pC, output, countA, countB, countC, + addition_first, d2, d3, d4); +} + +template <> +cudaError_t LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream, const ortc::MFloat16* input_a, + const ortc::MFloat16* input_b, const ortc::MFloat16* input_c, + ortc::MFloat16* output, int64_t length_a, int64_t length_b, 
int64_t length_c, + bool addition, + int64_t d2, int64_t d3, int64_t d4) { + return _LaunchAddAndMulSwitchMiddleAxesKernel(stream, pA, pB, pC, output, countA, countB, countC, + addition_first, d2, d3, d4); +} + +__device__ __forceinline__ void _submul_op(float* address, const float a, const float b, + const float c) { + *address = (a - b) * c; +} + +__device__ __forceinline__ void _submul_op(half* address, const half a, const half b, + const half c) { +#if __CUDA_ARCH__ < 700 + *address = __float2half((__half2float(a) - __half2float(b)) * __half2float(c)); +#else + *address = (a - b) * c; +#endif +} + +__device__ __forceinline__ void _submul_neg_op(float* address, const float a, const float b, + const float c) { + *address = (b - a) * c; +} + +__device__ __forceinline__ void _submul_neg_op(half* address, const half a, const half b, + const half c) { +#if __CUDA_ARCH__ < 700 + *address = __float2half((__half2float(b) - __half2float(a)) * __half2float(c)); +#else + *address = (b - a) * c; +#endif +} + +__device__ __forceinline__ void _mulsub_op(float* address, const float a, const float b, + const float c) { + *address = a * b - c; +} + +__device__ __forceinline__ void _mulsub_op(half* address, const half a, const half b, + const half c) { +#if __CUDA_ARCH__ < 700 + *address = __float2half(__half2float(a) * __half2float(b) - __half2float(c)); +#else + *address = a * b - c; +#endif +} + +__device__ __forceinline__ void _mulsub_neg_op(float* address, const float a, const float b, + const float c) { + *address = c - a * b; +} + +__device__ __forceinline__ void _mulsub_neg_op(half* address, const half a, const half b, + const half c) { +#if __CUDA_ARCH__ < 700 + *address = __float2half(__half2float(c) - __half2float(a) * __half2float(b)); +#else + *address = c - a * b; +#endif +} + +template +struct SubMul { + __device__ __inline__ void operator()(T* address, const T a, const T b, const T c) const { + _submul_op(address, a, b, c); + } +}; + +template +struct MulSub { + __device__ __inline__ void operator()(T* address, const T a, const T b, const T c) const { + _mulsub_op(address, a, b, c); + } +}; + +template +struct SubMulNeg { + __device__ __inline__ void operator()(T* address, const T a, const T b, const T c) const { + _submul_neg_op(address, a, b, c); + } +}; + +template +struct MulSubNeg { + __device__ __inline__ void operator()(T* address, const T a, const T b, const T c) const { + _mulsub_neg_op(address, a, b, c); + } +}; + +template +__global__ void _MulSubKernel(T* output_data, const T* pA, const T* pB, const T* pC, + CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, CUDA_LONG N, + const TFunc func) { + CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + CUDA_LONG id = start; +#pragma unroll + for (int i = 0; i < NumElementsPerThread; i++) { + if (id < N) { + func(output_data + id, pA[id % nA], pB[id % nB], pC[id % nC]); + id += NumThreadsPerBlock; + } + } +} + +template +cudaError_t _LaunchSubAndMulKernel(cudaStream_t stream, + const T* pA, const T* pB, const T* pC, + T* output, + int64_t countA, int64_t countB, int64_t countC, + bool addition_first) { + int64_t max_count = std::max(std::max(countA, countB), countC); + if (max_count == 0) // special case where there's a dim value of 0 in the output shape + return cudaGetLastError(); + + const int num_elements_per_thread = 4; + const int num_threads_per_block = 256; + const int num_el_th = num_threads_per_block * num_elements_per_thread; + + int blocksPerGrid = (max_count + num_el_th - 1) / num_el_th; + + 
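  // The kernel indexes each input with id % n, so an input shorter than the output
  // is repeated cyclically; the grid covers max_count outputs with
  // num_threads_per_block * num_elements_per_thread elements handled per block.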
using TT = typename contrib::CudaT::MappedType; + + if (addition_first) { + if (negative) { + SubAndMulKernel, num_threads_per_block, num_elements_per_thread> + <<>>( + cuda_stream, + reinterpret_cast(output), + reinterpret_cast(pA), + reinterpret_cast(pB), + reinterpret_cast(pC), + countA, countB, countC, + max_size, SubMulNEg()); + } else { + SubAndMulKernel, num_threads_per_block, num_elements_per_thread> + <<>>( + cuda_stream, + reinterpret_cast(output), + reinterpret_cast(pA), + reinterpret_cast(pB), + reinterpret_cast(pC), + countA, countB, countC, + max_size, SubMul()); + } + } else { + if (negative) { + SubAndMulKernel, num_threads_per_block, num_elements_per_thread> + <<>>( + cuda_stream, + reinterpret_cast(output), + reinterpret_cast(pA), + reinterpret_cast(pB), + reinterpret_cast(pC), + countA, countB, countC, + max_size, MulSubNeg()); + } else { + SubAndMulKernel, num_threads_per_block, num_elements_per_thread> + <<>>( + cuda_stream, + reinterpret_cast(output), + reinterpret_cast(pA), + reinterpret_cast(pB), + reinterpret_cast(pC), + countA, countB, countC, + max_size, MulSub()); + } + } + return cudaGetLastError(); +} + +template <> +cudaError_t LaunchSubAndMulKernel(cudaStream_t stream, const float* input_a, const float* input_b, const float* input_c, + float* output, int64_t length_a, int64_t length_b, int64_t length_c, + bool subtract_first, bool negative) { + return _LaunchSubAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, subtract_first, negative); +} + +template <> +cudaError_t LaunchSubAndMulKernel(cudaStream_t stream, + const ortc::MFloat16* input_a, const ortc::MFloat16* input_b, + const ortc::MFloat16* input_c, + ortc::MFloat16* output, int64_t length_a, int64_t length_b, int64_t length_c, + bool subtract_first, negative) { + return _LaunchSubAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, subtract_first, negative); +} diff --git a/operators/cuda/add_mul_impl.cuh b/operators/cuda/add_mul_impl.cuh index 98bb06169..2c5b000ec 100644 --- a/operators/cuda/add_mul_impl.cuh +++ b/operators/cuda/add_mul_impl.cuh @@ -17,10 +17,15 @@ cudaError_t LaunchAddOrMulTwiceKernel(cudaStream_t stream, const T* input_a, con template cudaError_t LaunchAddAndMulKernel(cudaStream_t stream, const T* input_a, const T* input_b, const T* input_c, T* output, int64_t length_a, int64_t length_b, int64_t length_c, - bool addition, bool switchMiddleAxis); + bool addition); template cudaError_t LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream, const T* input_a, const T* input_b, const T* input_c, T* output, int64_t length_a, int64_t length_b, int64_t length_c, bool addition, int64_t d2, int64_t d3, int64_t d4); + +template +cudaError_t LaunchSubAndMulKernel(cudaStream_t stream, const T* input_a, const T* input_b, const T* input_c, + T* output, int64_t length_a, int64_t length_b, int64_t length_c, + bool addition, bool negative); diff --git a/operators/cuda/cuda_ops.cc b/operators/cuda/cuda_ops.cc index 9ba3a7329..21feec2e2 100644 --- a/operators/cuda/cuda_ops.cc +++ b/operators/cuda/cuda_ops.cc @@ -16,8 +16,11 @@ FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& { using AddTwiceFloat32Type = typename contrib::AddOrMulTwice; using MulTwiceFloat32Type = typename contrib::AddOrMulTwice; - using AddAndMulFloat32Type = typename contrib::AddOrMulTwice; - using MulAndAddFloat32Type = typename contrib::AddOrMulTwice; + using AddAndMulFloat32Type = typename contrib::AddAndMul; + using MulAndAddFloat32Type = typename contrib::AddAndMul; + + 
using SubAndMulFloat32Type = typename contrib::SubAndMul; + using MulAndSubFloat32Type = typename contrib::SubAndMul; #if ORT_API_VERSION >= 16 using AddSharedInputFloat16Type = typename contrib::AddOrMulSharedInput; @@ -26,8 +29,11 @@ FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& { using AddTwiceFloat16Type = typename contrib::AddOrMulTwice; using MulTwiceFloat16Type = typename contrib::AddOrMulTwice; - using AddAndMulFloat32Type = typename contrib::AddOrMulTwice; - using MulAndAddFloat32Type = typename contrib::AddOrMulTwice; + using AddAndMulFloat32Type = typename contrib::AddAndMul; + using MulAndAddFloat32Type = typename contrib::AddAndMul; + + using SubAndMulFloat32Type = typename contrib::SubAndMul; + using MulAndSubFloat32Type = typename contrib::SubAndMul; #endif static OrtOpLoader op_loader( @@ -40,8 +46,10 @@ FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& { CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("MulAdd", MulAndAddFloat32Type), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat32Type), + CustomCudaStructV2("MulSub", MulAndSubFloat32Type), CustomCudaStructV2("MulTwice", MulTwiceFloat32Type), CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1), + CustomCudaStructV2("SubMul", SubAndMulFloat32Type), #if ORT_API_VERSION >= 16 CustomCudaStructV2("AddMul", AddAndMulFloat16Type), @@ -51,8 +59,10 @@ FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& { CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("MulAdd", MulAndAddFloat16Type), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat16Type), + CustomCudaStructV2("MulSub", MulAndSubFloat16Type), CustomCudaStructV2("MulTwice", MulTwiceFloat16Type), - CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1) + CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1), + CustomCudaStructV2("SubMul", SubAndMulFloat16Type) #endif #endif ); diff --git a/test/cuda/test_cudaops.py b/test/cuda/test_cudaops.py index db11b212c..ca94646fc 100644 --- a/test/cuda/test_cudaops.py +++ b/test/cuda/test_cudaops.py @@ -423,6 +423,26 @@ def test_submul_cuda(self): self._addmul_cuda(TensorProto.FLOAT, "Sub", "Mul") self._addmul_cuda(TensorProto.FLOAT16, "Sub", "Mul") + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_submul_cuda_negative(self): + self._addmul_cuda(TensorProto.FLOAT, "Sub", "Mul", negative=True) + self._addmul_cuda(TensorProto.FLOAT16, "Sub", "Mul", negative=True) + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_submul_cuda_broadcast(self): + self._addmul_cuda(TensorProto.FLOAT, "Sub", "Mul", True) + self._addmul_cuda(TensorProto.FLOAT16, "Sub", "Mul", True) + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_mulsub_cuda(self): + self._addmul_cuda(TensorProto.FLOAT, "Mul", "Sub") + self._addmul_cuda(TensorProto.FLOAT16, "Mul", "Sub") + + @unittest.skipIf(not has_cuda(), reason="cuda not available") + def test_mulsub_cuda_negative(self): + self._addmul_cuda(TensorProto.FLOAT, "Mul", "Sub", negative=True) + self._addmul_cuda(TensorProto.FLOAT16, "Mul", "Sub", negative=True) + if __name__ == "__main__": unittest.main() From cf1e47f7878a8d7043688ac66c73cb9d12355073 Mon Sep 17 00:00:00 2001 From: xadupre Date: Mon, 10 Jun 2024 11:00:20 +0000 Subject: [PATCH 4/7] fix compilation --- operators/cuda/add_mul.h | 124 ++++++++++++-------------- operators/cuda/add_mul_impl.cu | 158 +++++++++++++++++---------------- operators/cuda/cuda_ops.cc | 16 ++-- 
test/cuda/test_cudaops.py | 10 +-- 4 files changed, 149 insertions(+), 159 deletions(-) diff --git a/operators/cuda/add_mul.h b/operators/cuda/add_mul.h index a23e07b8c..4995642e7 100644 --- a/operators/cuda/add_mul.h +++ b/operators/cuda/add_mul.h @@ -8,6 +8,27 @@ namespace contrib { +inline void _FillOutputShape3Op(std::vector& dimsA, + std::vector& dimsB, + std::vector& dimsC, + std::vector& output_dims) { + auto max_rank = std::max(dimsA.size(), std::max(dimsB.size(), dimsC.size())); + while (dimsA.size() < max_rank) + dimsA.insert(dimsA.begin(), 1); + while (dimsB.size() < max_rank) + dimsB.insert(dimsB.begin(), 1); + while (dimsC.size() < max_rank) + dimsC.insert(dimsC.begin(), 1); + + output_dims.resize(dimsA.size()); + for (size_t i = 0; i < dimsA.size(); ++i) { + output_dims[i] = std::max(std::max(dimsA[i], dimsB[i]), dimsC[i]); + if (output_dims[i] == 0) { + ORTX_CXX_API_THROW("One of the input dimensions is null.", ORT_RUNTIME_EXCEPTION); + } + } +} + template struct AddOrMulSharedInput { template @@ -20,22 +41,19 @@ struct AddOrMulSharedInput { const ortc::Tensor& tensor_c, ortc::Tensor& output_ab, ortc::Tensor& output_ac) const { - const T* input_data_a = tensor_a.Data(); - const T* input_data_b = tensor_b.Data(); - const T* input_data_c = tensor_c.Data(); - auto length_a = tensor_a.NumberOfElement(); auto length_b = tensor_b.NumberOfElement(); auto length_c = tensor_c.NumberOfElement(); - T* output_data_ab = output_ab.Allocate(length_a <= length_b ? tensor_b.Shape() : tensor_a.Shape()); - T* output_data_ac = output_ab.Allocate(length_a <= length_c ? tensor_c.Shape() : tensor_a.Shape()); - - if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) { + if (0 == length_a || 0 == length_b || 0 == length_c) { return {}; } + + T* output_data_ab = output_ab.Allocate(length_a <= length_b ? tensor_b.Shape() : tensor_a.Shape()); + T* output_data_ac = output_ac.Allocate(length_a <= length_c ? tensor_c.Shape() : tensor_a.Shape()); + LaunchAddOrMulSharedInputKernel(reinterpret_cast(ctx->GetCudaStream()), - input_data_a, input_data_b, input_data_c, + tensor_a.Data(), tensor_b.Data(), tensor_c.Data(), output_data_ab, output_data_ac, length_a, length_b, length_c, addition); @@ -54,25 +72,24 @@ struct AddOrMulTwice { const ortc::Tensor& tensor_b, const ortc::Tensor& tensor_c, ortc::Tensor& output) const { - const T* input_data_a = tensor_a.Data(); - const T* input_data_b = tensor_b.Data(); - const T* input_data_c = tensor_c.Data(); - auto length_a = tensor_a.NumberOfElement(); auto length_b = tensor_b.NumberOfElement(); auto length_c = tensor_c.NumberOfElement(); - T* output_data_ab = output_ab.Allocate( - length_a <= length_b - ? lenght_c <= length_b ? tensor_b.Shape() : tensor_c.Shape() - : lenght_a <= length_b ? 
tensor_b.Shape() - : tensor_a.Shape()); - - if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) { + if (0 == length_a || 0 == length_b || 0 == length_c) { return {}; } + + std::vector dimsA = tensor_a.Shape(); + std::vector dimsB = tensor_b.Shape(); + std::vector dimsC = tensor_c.Shape(); + std::vector output_dims; + _FillOutputShape3Op(dimsA, dimsB, dimsC, output_dims); + + T* output_data = output.Allocate(output_dims); + LaunchAddOrMulTwiceKernel(reinterpret_cast(ctx->GetCudaStream()), - input_data_a, input_data_b, input_data_c, + tensor_a.Data(), tensor_b.Data(), tensor_c.Data(), output_data, length_a, length_b, length_c, addition); @@ -84,42 +101,27 @@ template struct AddAndMul { template OrtxStatus OnModelAttach(const TDict& dict) { - return {}; + int64_t default_value = 0; + switchMiddelAxis_ = dict.TryToGetAttributeWithDefault("switchMiddleAxis", default_value) == 1; } OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx, const ortc::Tensor& tensor_a, const ortc::Tensor& tensor_b, const ortc::Tensor& tensor_c, ortc::Tensor& output) const { - const T* input_data_a = tensor_a.Data(); - const T* input_data_b = tensor_b.Data(); - const T* input_data_c = tensor_c.Data(); - auto length_a = tensor_a.NumberOfElement(); auto length_b = tensor_b.NumberOfElement(); auto length_c = tensor_c.NumberOfElement(); - if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) { + + if (0 == length_a || 0 == length_b || 0 == length_c) { return {}; } std::vector dimsA = tensor_a.Shape(); std::vector dimsB = tensor_b.Shape(); std::vector dimsC = tensor_c.Shape(); - - auto max_length = std::max(length_a, std::max(length_b, length_c)); - - auto max_rank = std::max(dimsA.size(), std::max(dimsB.size(), dimsC.size())); - while (dimsA.size() < max_rank) - dimsA.insert(dimsA.begin(), 1); - while (dimsB.size() < max_rank) - dimsB.insert(dimsB.begin(), 1); - while (dimsC.size() < max_rank) - dimsC.insert(dimsC.begin(), 1); - - std::vector output_dims(dimsA.size()); - for (size_t i = 0; i < dimsA.size(); ++i) { - output_dims[i] = std::max(std::max(dimsA[i], dimsB[i]), dimsC[i]); - } + std::vector output_dims; + _FillOutputShape3Op(dimsA, dimsB, dimsC, output_dims); if (switchMiddelAxis_) { if (output_dims.size() != 4) { @@ -130,15 +132,16 @@ struct AddAndMul { int64_t d2 = output_dims[output_dims.size() - 3]; output_dims[1] = d3; output_dims[2] = d2; + T* output_data = output.Allocate(output_dims); LaunchAddAndMulSwitchMiddleAxesKernel(reinterpret_cast(ctx->GetCudaStream()), - input_data_a, input_data_b, input_data_c, + tensor_a.Data(), tensor_b.Data(), tensor_c.Data(), output_data, length_a, length_b, length_c, addition_first, d2, d3, d4); } else { - T* output_data_ab = output_ab.Allocate(output_dims); + T* output_data = output.Allocate(output_dims); LaunchAddAndMulKernel(reinterpret_cast(ctx->GetCudaStream()), - input_data_a, input_data_b, input_data_c, + tensor_a.Data(), tensor_b.Data(), tensor_c.Data(), output_data, length_a, length_b, length_c, addition_first); @@ -154,46 +157,31 @@ template struct SubAndMul { template OrtxStatus OnModelAttach(const TDict& dict) { - return {}; + //int64_t default_value = 0; + //negative_ = dict.TryToGetAttributeWithDefault("negative", default_value) == 1; + negative_ = false; } OrtxStatus Compute(Ort::Custom::CUDAKernelContext* ctx, const ortc::Tensor& tensor_a, const ortc::Tensor& tensor_b, const ortc::Tensor& tensor_c, ortc::Tensor& output) const { - const T* input_data_a = tensor_a.Data(); - const T* input_data_b = tensor_b.Data(); - const T* 
input_data_c = tensor_c.Data(); - auto length_a = tensor_a.NumberOfElement(); auto length_b = tensor_b.NumberOfElement(); auto length_c = tensor_c.NumberOfElement(); - if (0 == input_data_a || 0 == input_data_b || 0 == input_data_c) { + if (0 == length_a || 0 == length_b || 0 == length_c) { return {}; } std::vector dimsA = tensor_a.Shape(); std::vector dimsB = tensor_b.Shape(); std::vector dimsC = tensor_c.Shape(); + std::vector output_dims; + _FillOutputShape3Op(dimsA, dimsB, dimsC, output_dims); + T* output_data = output.Allocate(output_dims); - auto max_length = std::max(length_a, std::max(length_b, length_c)); - - auto max_rank = std::max(dimsA.size(), std::max(dimsB.size(), dimsC.size())); - while (dimsA.size() < max_rank) - dimsA.insert(dimsA.begin(), 1); - while (dimsB.size() < max_rank) - dimsB.insert(dimsB.begin(), 1); - while (dimsC.size() < max_rank) - dimsC.insert(dimsC.begin(), 1); - - std::vector output_dims(dimsA.size()); - for (size_t i = 0; i < dimsA.size(); ++i) { - output_dims[i] = std::max(std::max(dimsA[i], dimsB[i]), dimsC[i]); - } - - T* output_data_ab = output_ab.Allocate(output_dims); LaunchSubAndMulKernel(reinterpret_cast(ctx->GetCudaStream()), - input_data_a, input_data_b, input_data_c, + tensor_a.Data(), tensor_b.Data(), tensor_c.Data(), output_data, length_a, length_b, length_c, subtract_first, negative_); diff --git a/operators/cuda/add_mul_impl.cu b/operators/cuda/add_mul_impl.cu index 14bf73b62..20919c467 100644 --- a/operators/cuda/add_mul_impl.cu +++ b/operators/cuda/add_mul_impl.cu @@ -169,15 +169,15 @@ struct Add3Op { }; template -__global__ void AddMulTwiceKernel(T* output, const T* pA, const T* pB, - const T* pC, CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, - CUDA_LONG N, const TFunc func) { +__global__ void AddMulTwiceKernel(T* output_data, const T* pA, const T* pB, const T* pC, + CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, CUDA_LONG N, + const TFunc func) { CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; CUDA_LONG id = start; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { - func(output_ab, pA[id % nA], pB[id % nB], pC[id % nC]); + func(output_data + id, pA[id % nA], pB[id % nB], pC[id % nC]); id += NumThreadsPerBlock; } } @@ -206,36 +206,39 @@ cudaError_t _LaunchAddOrMulTwiceKernel(cudaStream_t stream, reinterpret_cast(output), reinterpret_cast(pA), reinterpret_cast(pB), reinterpret_cast(pC), static_cast(countA), static_cast(countB), static_cast(countC), - static_cast(max_count), Add3SharedOp()); + static_cast(max_count), Add3Op()); } else { AddMulTwiceKernel, num_threads_per_block, num_elements_per_thread> <<>>( reinterpret_cast(output), reinterpret_cast(pA), reinterpret_cast(pB), reinterpret_cast(pC), static_cast(countA), static_cast(countB), static_cast(countC), - static_cast(max_count), Mul3SharedOp()); + static_cast(max_count), Mul3Op()); } return cudaGetLastError(); } template <> -cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream, - const float* input_a, const float* input_b, const float* input_c, - float* output, - int64_t length_a, int64_t length_b, int64_t length_c, bool addition) { - return _LaunchAddOrMulSharedInputKernel(stream, input_a, input_b, input_c, - output, - length_a, length_b, length_c, addition); +cudaError_t LaunchAddOrMulTwiceKernel(cudaStream_t stream, + const float* input_a, const float* input_b, const float* input_c, + float* output, + int64_t length_a, int64_t length_b, int64_t length_c, + bool addition) { + return 
+  return _LaunchAddOrMulTwiceKernel(stream, input_a, input_b, input_c,
+                                    output,
+                                    length_a, length_b, length_c, addition);
 }

 template <>
-cudaError_t LaunchAddOrMulSharedInputKernel(cudaStream_t stream,
-                                            const ortc::MFloat16* input_a, const ortc::MFloat16* input_b, const ortc::MFloat16* input_c,
-                                            ortc::MFloat16* output,
-                                            int64_t length_a, int64_t length_b, int64_t length_c, bool addition) {
-  return _LaunchAddOrMulSharedInputKernel(stream, input_a, input_b, input_c,
-                                          output,
-                                          length_a, length_b, length_c, addition);
+cudaError_t LaunchAddOrMulTwiceKernel(cudaStream_t stream,
+                                      const ortc::MFloat16* input_a, const ortc::MFloat16* input_b,
+                                      const ortc::MFloat16* input_c,
+                                      ortc::MFloat16* output,
+                                      int64_t length_a, int64_t length_b, int64_t length_c,
+                                      bool addition) {
+  return _LaunchAddOrMulTwiceKernel(stream, input_a, input_b, input_c,
+                                    output,
+                                    length_a, length_b, length_c, addition);
 }

 __device__ __forceinline__ void _addmul_op(float* address, const float a, const float b,
@@ -281,7 +284,7 @@ struct MulAdd {
 };

 template
-__global__ void _AddAndMulKernel(T* output_data, const T* pA, const T* pB, const T* pC,
+__global__ void AddAndMulKernel(T* output_data, const T* pA, const T* pB, const T* pC,
                                 CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, CUDA_LONG N,
                                 const TFunc func) {
   CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
@@ -296,7 +299,7 @@ __global__ void _AddAndMulKernel(T* output_data, const T* pA, const T* pB, const
 }

 template
-__global__ void _AddAndMulSwitchMiddleAxesKernel(T* output_data, const T* pA, const T* pB,
+__global__ void AddAndMulSwitchMiddleAxesKernel(T* output_data, const T* pA, const T* pB,
                                                 const T* pC,
                                                 CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC,
                                                 CUDA_LONG N, const TFunc func, CUDA_LONG d2,
@@ -337,23 +340,27 @@ cudaError_t _LaunchAddAndMulKernel(cudaStream_t stream,
   if (addition_first) {
     AddAndMulKernel, num_threads_per_block, num_elements_per_thread>
         <<>>(
-            cuda_stream,
             reinterpret_cast(output),
             reinterpret_cast(pA),
             reinterpret_cast(pB),
             reinterpret_cast(pC),
-            countA, countB, countC,
-            max_size, AddMul());
+            static_cast(countA),
+            static_cast(countB),
+            static_cast(countC),
+            static_cast(max_count),
+            AddMul());
   } else {
     AddAndMulKernel, num_threads_per_block, num_elements_per_thread>
         <<>>(
-            cuda_stream,
             reinterpret_cast(output),
             reinterpret_cast(pA),
             reinterpret_cast(pB),
             reinterpret_cast(pC),
-            countA, countB, countC,
-            max_size, MulAdd());
+            static_cast(countA),
+            static_cast(countB),
+            static_cast(countC),
+            static_cast(max_count),
+            MulAdd());
   }
   return cudaGetLastError();
 }

@@ -361,8 +368,8 @@ cudaError_t _LaunchAddAndMulKernel(cudaStream_t stream,
 template <>
 cudaError_t LaunchAddAndMulKernel(cudaStream_t stream,
                                   const float* input_a, const float* input_b, const float* input_c, float* output,
                                   int64_t length_a, int64_t length_b, int64_t length_c,
-                                  bool addition) {
-  return _LaunchAddAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, addition_first);
+                                  bool addition_first) {
+  return _LaunchAddAndMulKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c, addition_first);
 }

 template <>
@@ -370,8 +377,8 @@ cudaError_t LaunchAddAndMulKernel(cudaStream_t stream,
                                   const ortc::MFloat16* input_a, const ortc::MFloat16* input_b,
                                   const ortc::MFloat16* input_c, ortc::MFloat16* output,
                                   int64_t length_a, int64_t length_b, int64_t length_c,
-                                  bool addition) {
-  return _LaunchAddAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, addition_first);
+                                  bool addition_first) {
+  return _LaunchAddAndMulKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c, addition_first);
 }

 template
@@ -395,23 +402,27 @@ cudaError_t _LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream,
   if (addition_first) {
     AddAndMulSwitchMiddleAxesKernel, num_threads_per_block, num_elements_per_thread>
         <<>>(
-            cuda_stream,
             reinterpret_cast(output),
             reinterpret_cast(pA),
             reinterpret_cast(pB),
             reinterpret_cast(pC),
-            countA, countB, countC,
-            max_size, AddMul());
+            static_cast(countA),
+            static_cast(countB),
+            static_cast(countC),
+            static_cast(max_count),
+            AddMul(), d2, d3, d4);
   } else {
     AddAndMulSwitchMiddleAxesKernel, num_threads_per_block, num_elements_per_thread>
         <<>>(
-            cuda_stream,
             reinterpret_cast(output),
             reinterpret_cast(pA),
             reinterpret_cast(pB),
             reinterpret_cast(pC),
-            countA, countB, countC,
-            max_size, MulAdd());
+            static_cast(countA),
+            static_cast(countB),
+            static_cast(countC),
+            static_cast(max_count),
+            MulAdd(), d2, d3, d4);
   }
   return cudaGetLastError();
 }

@@ -419,9 +430,9 @@ cudaError_t _LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream,
 template <>
 cudaError_t LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream,
                                                   const float* input_a, const float* input_b, const float* input_c, float* output,
                                                   int64_t length_a, int64_t length_b, int64_t length_c,
-                                                  bool addition,
+                                                  bool addition_first,
                                                   int64_t d2, int64_t d3, int64_t d4) {
-  return _LaunchAddAndMulSwitchMiddleAxesKernel(stream, pA, pB, pC, output, countA, countB, countC,
+  return _LaunchAddAndMulSwitchMiddleAxesKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c,
                                                 addition_first, d2, d3, d4);
 }

@@ -429,9 +440,9 @@ template <>
 cudaError_t LaunchAddAndMulSwitchMiddleAxesKernel(cudaStream_t stream,
                                                   const ortc::MFloat16* input_a, const ortc::MFloat16* input_b,
                                                   const ortc::MFloat16* input_c, ortc::MFloat16* output,
                                                   int64_t length_a, int64_t length_b, int64_t length_c,
-                                                  bool addition,
+                                                  bool addition_first,
                                                   int64_t d2, int64_t d3, int64_t d4) {
-  return _LaunchAddAndMulSwitchMiddleAxesKernel(stream, pA, pB, pC, output, countA, countB, countC,
+  return _LaunchAddAndMulSwitchMiddleAxesKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c,
                                                 addition_first, d2, d3, d4);
 }

@@ -519,27 +530,12 @@ struct MulSubNeg {
   }
 };

-template
-__global__ void _MulSubKernel(T* output_data, const T* pA, const T* pB, const T* pC,
-                              CUDA_LONG nA, CUDA_LONG nB, CUDA_LONG nC, CUDA_LONG N,
-                              const TFunc func) {
-  CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
-  CUDA_LONG id = start;
-#pragma unroll
-  for (int i = 0; i < NumElementsPerThread; i++) {
-    if (id < N) {
-      func(output_data + id, pA[id % nA], pB[id % nB], pC[id % nC]);
-      id += NumThreadsPerBlock;
-    }
-  }
-}
-
 template
 cudaError_t _LaunchSubAndMulKernel(cudaStream_t stream,
                                    const T* pA, const T* pB, const T* pC,
                                    T* output,
                                    int64_t countA, int64_t countB, int64_t countC,
-                                   bool addition_first) {
+                                   bool addition_first, bool negative) {
   int64_t max_count = std::max(std::max(countA, countB), countC);
   if (max_count == 0)  // special case where there's a dim value of 0 in the output shape
     return cudaGetLastError();
@@ -554,47 +550,55 @@ cudaError_t _LaunchSubAndMulKernel(cudaStream_t stream,
   if (addition_first) {
     if (negative) {
-      SubAndMulKernel, num_threads_per_block, num_elements_per_thread>
+      AddAndMulKernel, num_threads_per_block, num_elements_per_thread>
          <<>>(
-            cuda_stream,
             reinterpret_cast(output),
             reinterpret_cast(pA),
             reinterpret_cast(pB),
             reinterpret_cast(pC),
-            countA, countB, countC,
-            max_size, SubMulNEg());
+            static_cast(countA),
+            static_cast(countB),
+            static_cast(countC),
+            static_cast(max_count),
+            SubMulNeg());
     } else {
-      SubAndMulKernel, num_threads_per_block, num_elements_per_thread>
+      AddAndMulKernel, num_threads_per_block, num_elements_per_thread>
          <<>>(
-            cuda_stream,
             reinterpret_cast(output),
             reinterpret_cast(pA),
             reinterpret_cast(pB),
             reinterpret_cast(pC),
-            countA, countB, countC,
-            max_size, SubMul());
+            static_cast(countA),
+            static_cast(countB),
+            static_cast(countC),
+            static_cast(max_count),
+            SubMul());
     }
   } else {
     if (negative) {
-      SubAndMulKernel, num_threads_per_block, num_elements_per_thread>
+      AddAndMulKernel, num_threads_per_block, num_elements_per_thread>
          <<>>(
-            cuda_stream,
             reinterpret_cast(output),
             reinterpret_cast(pA),
             reinterpret_cast(pB),
             reinterpret_cast(pC),
-            countA, countB, countC,
-            max_size, MulSubNeg());
+            static_cast(countA),
+            static_cast(countB),
+            static_cast(countC),
+            static_cast(max_count),
+            MulSubNeg());
     } else {
-      SubAndMulKernel, num_threads_per_block, num_elements_per_thread>
+      AddAndMulKernel, num_threads_per_block, num_elements_per_thread>
          <<>>(
-            cuda_stream,
             reinterpret_cast(output),
             reinterpret_cast(pA),
             reinterpret_cast(pB),
             reinterpret_cast(pC),
-            countA, countB, countC,
-            max_size, MulSub());
+            static_cast(countA),
+            static_cast(countB),
+            static_cast(countC),
+            static_cast(max_count),
+            MulSub());
     }
   }
   return cudaGetLastError();
@@ -604,7 +608,7 @@ template <>
 cudaError_t LaunchSubAndMulKernel(cudaStream_t stream,
                                   const float* input_a, const float* input_b, const float* input_c, float* output,
                                   int64_t length_a, int64_t length_b, int64_t length_c,
                                   bool subtract_first, bool negative) {
-  return _LaunchSubAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, subtract_first, negative);
+  return _LaunchSubAndMulKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c, subtract_first, negative);
 }

 template <>
@@ -612,6 +616,6 @@ cudaError_t LaunchSubAndMulKernel(cudaStream_t stream,
                                   const ortc::MFloat16* input_a, const ortc::MFloat16* input_b,
                                   const ortc::MFloat16* input_c, ortc::MFloat16* output,
                                   int64_t length_a, int64_t length_b, int64_t length_c,
-                                  bool subtract_first, negative) {
-  return _LaunchSubAndMulKernel(stream, pA, pB, pC, output, countA, countB, countC, subtract_first, negative);
+                                  bool subtract_first, bool negative) {
+  return _LaunchSubAndMulKernel(stream, input_a, input_b, input_c, output, length_a, length_b, length_c, subtract_first, negative);
 }
diff --git a/operators/cuda/cuda_ops.cc b/operators/cuda/cuda_ops.cc
index 21feec2e2..0587fc6f9 100644
--- a/operators/cuda/cuda_ops.cc
+++ b/operators/cuda/cuda_ops.cc
@@ -29,38 +29,38 @@ FxLoadCustomOpFactory LoadCustomOpClasses_Contrib = []() -> CustomOpArray& {
   using AddTwiceFloat16Type = typename contrib::AddOrMulTwice;
   using MulTwiceFloat16Type = typename contrib::AddOrMulTwice;

-  using AddAndMulFloat32Type = typename contrib::AddAndMul;
-  using MulAndAddFloat32Type = typename contrib::AddAndMul;
+  using AddAndMulFloat16Type = typename contrib::AddAndMul;
+  using MulAndAddFloat16Type = typename contrib::AddAndMul;

-  using SubAndMulFloat32Type = typename contrib::SubAndMul;
-  using MulAndSubFloat32Type = typename contrib::SubAndMul;
+  using SubAndMulFloat16Type = typename contrib::SubAndMul;
+  using MulAndSubFloat16Type = typename contrib::SubAndMul;
 #endif

   static OrtOpLoader op_loader(
       []() { return nullptr; }
 #ifdef USE_CUDA
       ,
+      CustomCudaStructV2("AddAdd", AddTwiceFloat32Type),
CustomCudaStructV2("AddMul", AddAndMulFloat32Type), CustomCudaStructV2("AddSharedInput", AddSharedInputFloat32Type), - CustomCudaStructV2("AddTwice", AddTwiceFloat32Type), CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("MulAdd", MulAndAddFloat32Type), + CustomCudaStructV2("MulMul", MulTwiceFloat32Type), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat32Type), CustomCudaStructV2("MulSub", MulAndSubFloat32Type), - CustomCudaStructV2("MulTwice", MulTwiceFloat32Type), CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1), CustomCudaStructV2("SubMul", SubAndMulFloat32Type), #if ORT_API_VERSION >= 16 + CustomCudaStructV2("AddAdd", AddTwiceFloat16Type), CustomCudaStructV2("AddMul", AddAndMulFloat16Type), CustomCudaStructV2("AddSharedInput", AddSharedInputFloat16Type), - CustomCudaStructV2("AddTwice", AddTwiceFloat16Type), CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("FastGelu", contrib::FastGelu), CustomCudaStructV2("MulAdd", MulAndAddFloat16Type), + CustomCudaStructV2("MulMul", MulTwiceFloat16Type), CustomCudaStructV2("MulSharedInput", MulSharedInputFloat16Type), CustomCudaStructV2("MulSub", MulAndSubFloat16Type), - CustomCudaStructV2("MulTwice", MulTwiceFloat16Type), CustomCudaStructV2("NegXPlus1", contrib::NegXPlus1), CustomCudaStructV2("SubMul", SubAndMulFloat16Type) #endif diff --git a/test/cuda/test_cudaops.py b/test/cuda/test_cudaops.py index ca94646fc..ab7b1f4a3 100644 --- a/test/cuda/test_cudaops.py +++ b/test/cuda/test_cudaops.py @@ -71,8 +71,8 @@ def _addaddmulmul_cuda(self, itype, op_type, broad=False): shapey = (3, 2, 3) shapez = (1, 2, 3) if broad else (3, 2, 3) x = (np.arange(np.prod(shapex)) + 1).reshape(shapex).astype(dtype) - y = (np.arange(np.prod(shapey)) + 1).reshape(shapey).astype(dtype) - z = (np.arange(np.prod(shapez)) + 1).reshape(shapez).astype(dtype) + y = (np.arange(np.prod(shapey)) + 10).reshape(shapey).astype(dtype) + z = (np.arange(np.prod(shapez)) + 100).reshape(shapez).astype(dtype) feeds1 = dict(X=x, Y=y, Z=z) ref = ReferenceEvaluator(model1) @@ -228,8 +228,6 @@ def test_cuda_negxplus1(self): self._negxplus1_cuda(TensorProto.FLOAT16) def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3, 2, 3), shapec=(3, 2, 3)): - from ai.onnx.contrib import get_ort_ext_libs - model1 = helper.make_model( helper.make_graph( [ @@ -289,7 +287,7 @@ def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3, expected = ref.run(None, feeds1) opts = _ort.SessionOptions() - opts.register_custom_ops_library(get_ort_ext_libs()[0]) + opts.register_custom_ops_library(_get_library_path()) sess = _ort.InferenceSession(model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"]) got = sess.run(None, feeds1) for i in range(2): @@ -445,4 +443,4 @@ def test_mulsub_cuda_negative(self): if __name__ == "__main__": - unittest.main() + unittest.main(verbosity=2) From 8af428aad56c13354358f236834faf8857202c0d Mon Sep 17 00:00:00 2001 From: xadupre Date: Tue, 11 Jun 2024 09:40:59 +0000 Subject: [PATCH 5/7] more comments --- operators/cuda/add_mul.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/operators/cuda/add_mul.h b/operators/cuda/add_mul.h index b51282f94..1aea73e1f 100644 --- a/operators/cuda/add_mul.h +++ b/operators/cuda/add_mul.h @@ -29,6 +29,14 @@ inline void _FillOutputShape3Op(std::vector& dimsA, } } +/** +* AddOrMulSharedInput(A, B, C) = A + B, A + C ifaddition is true +* AddOrMulSharedInput(A, B, C) = A * B, A * C ifaddition is false +* 
+* The operator supports broadcast on first dimensions.
+* A[1, J] + B[I, J] is supported,
+* A[1, J, 1] + B[I, J, K] is not supported,
+*/
 template
 struct AddOrMulSharedInput {
   template
@@ -61,6 +69,14 @@ struct AddOrMulSharedInput {
   }
 };

+/**
+* AddOrMulTwice(A, B, C) = A + B + C if addition is true
+* AddOrMulTwice(A, B, C) = A * B * C if addition is false
+*
+* The operator supports broadcast on first dimensions.
+* A[1, J] + B[I, J] is supported,
+* A[1, J, 1] + B[I, J, K] is not supported,
+*/
 template
 struct AddOrMulTwice {
   template
@@ -97,6 +113,17 @@ struct AddOrMulTwice {
   }
 };

+/**
+* AddAndMul(A, B, C) = (A + B) * C if addition_first is true
+* AddAndMul(A, B, C) = A * B + C if addition_first is false
+*
+* The operator supports broadcast on first dimensions.
+* A[1, J] + B[I, J] is supported,
+* A[1, J, 1] + B[I, J, K] is not supported,
+*
+* If switchMiddleAxis is true, the output is transposed:
+* AddAndMul(A, B, C, switchMiddleAxis=1) = Transpose((A + B) * C, perm=[0, 2, 1, 3])
+*/
 template
 struct AddAndMul {
   template
@@ -154,6 +181,17 @@ struct AddAndMul {
   bool switchMiddelAxis_;
 };

+/**
+* SubAndMul(A, B, C) = (A - B) * C if subtract_first is true
+* SubAndMul(A, B, C) = A * B - C if subtract_first is false
+*
+* The operator supports broadcast on first dimensions.
+* A[1, J] + B[I, J] is supported,
+* A[1, J, 1] + B[I, J, K] is not supported,
+*
+* If negative is true, then the output is transposed, then
+* SubAndMul(A, B, C, negative=1) = (B - A) * C
+*/
 template
 struct SubAndMul {
   template

From 3bf3cc926a97b3615406e5c6ad55ee7a12402985 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?=
Date: Wed, 12 Jun 2024 10:44:05 +0200
Subject: [PATCH 6/7] Update operators/cuda/add_mul.h

Co-authored-by: Wei-Sheng Chin
---
 operators/cuda/add_mul.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/operators/cuda/add_mul.h b/operators/cuda/add_mul.h
index 1aea73e1f..fd9f3397a 100644
--- a/operators/cuda/add_mul.h
+++ b/operators/cuda/add_mul.h
@@ -36,6 +36,7 @@ inline void _FillOutputShape3Op(std::vector<int64_t>& dimsA,
 * The operator supports broadcast on first dimensions.
 * A[1, J] + B[I, J] is supported,
 * A[1, J, 1] + B[I, J, K] is not supported,
+* In all other cases, all tensors must have the same shape.
 */
 template
 struct AddOrMulSharedInput {

From 6710e81f973a52aeae586f3ebc6c901296b8046a Mon Sep 17 00:00:00 2001
From: Xavier Dupre
Date: Thu, 27 Jun 2024 09:51:07 +0200
Subject: [PATCH 7/7] fix documentation

---
 operators/cuda/add_mul.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/operators/cuda/add_mul.h b/operators/cuda/add_mul.h
index fd9f3397a..badac483e 100644
--- a/operators/cuda/add_mul.h
+++ b/operators/cuda/add_mul.h
@@ -190,7 +190,7 @@ struct AddAndMul {
 * A[1, J] + B[I, J] is supported,
 * A[1, J, 1] + B[I, J, K] is not supported,
 *
-* If negative is true, then the output is transposed, then
+* If negative is true, then
 * SubAndMul(A, B, C, negative=1) = (B - A) * C
 */
 template
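
Usage note: the sketch below shows one way the newly registered AddAdd operator could be exercised from Python, following the pattern of test/cuda/test_cudaops.py. It is an illustrative example only, not part of the patch series: it assumes the op is exposed under the "ai.onnx.contrib" domain and that onnxruntime-extensions provides get_library_path() for registering the custom-op library; adjust those names if the local build differs.

# Minimal sketch (assumptions: "AddAdd" is available under the "ai.onnx.contrib"
# domain and get_library_path() points at the built extensions library).
import numpy as np
import onnxruntime as ort
from onnx import TensorProto, helper
from onnxruntime_extensions import get_library_path


def make_addadd_model(itype=TensorProto.FLOAT):
    # Z = (X + Y) + W computed by a single fused AddAdd node.
    node = helper.make_node("AddAdd", ["X", "Y", "W"], ["Z"], domain="ai.onnx.contrib")
    graph = helper.make_graph(
        [node],
        "addadd",
        [helper.make_tensor_value_info(n, itype, [None, None]) for n in ("X", "Y", "W")],
        [helper.make_tensor_value_info("Z", itype, [None, None])],
    )
    return helper.make_model(
        graph,
        opset_imports=[helper.make_opsetid("", 18), helper.make_opsetid("ai.onnx.contrib", 1)],
    )


if __name__ == "__main__":
    x, y, w = (np.random.rand(3, 4).astype(np.float32) for _ in range(3))
    opts = ort.SessionOptions()
    opts.register_custom_ops_library(get_library_path())
    sess = ort.InferenceSession(
        make_addadd_model().SerializeToString(), opts, providers=["CUDAExecutionProvider"]
    )
    (got,) = sess.run(None, {"X": x, "Y": y, "W": w})
    # The fused op should match the plain numpy computation.
    np.testing.assert_allclose(x + y + w, got, rtol=1e-5)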