Kernels
wyldecat committed
Commit 7e4334d · 1 Parent(s): 66b3c5e

refactor(activation): change fused_add_rms_norm and fused_add_rms_norm_backward to out-of-place operations

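In short, the ops stop writing into caller-provided output tensors and instead allocate and return their results. A minimal before/after sketch of the Python-side call (the `activation.ops` import path is an assumption; the op names and argument order are taken from the files changed below):

import torch
from activation import ops  # assumption: the extension's Python ops wrapper

x = torch.randn(4, 128, device="cuda", dtype=torch.float16)
residual = torch.randn_like(x)
weight = torch.randn(128, device="cuda", dtype=torch.float16)
eps = 1e-6

# Before this commit (in-place): the caller preallocates both outputs.
#   out = torch.empty_like(x); add_out = torch.empty_like(x)
#   ops.fused_add_rms_norm(out, add_out, x, residual, weight, eps)

# After this commit (out-of-place): the op allocates and returns them.
out, add_out = ops.fused_add_rms_norm(x, residual, weight, eps)

# The backward op follows the same pattern and returns (input_grad, weight_grad).
output_grad = torch.randn_like(out)
add_output_grad = torch.randn_like(add_out)
grad, weight_grad = ops.fused_add_rms_norm_backward(
    output_grad, add_output_grad, add_out, weight, eps, need_input_grad=True)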
activation/fused_add_rms_norm.cu CHANGED
@@ -295,20 +295,19 @@ __global__ std::enable_if_t<(width == 0)> fused_add_rms_norm_backward_kernel(
         weight.data_ptr<scalar_t>(), eps, d); \
       });
 
-void fused_add_rms_norm(torch::Tensor &out,            // [..., d]
-                        torch::Tensor &add_out,        // [..., d]
-                        const torch::Tensor &input,    // [..., d]
-                        const torch::Tensor &residual, // [..., d]
-                        const torch::Tensor &weight,   // [d]
-                        double eps) {
+std::tuple<torch::Tensor, torch::Tensor>
+fused_add_rms_norm(const torch::Tensor &input,    // [..., d]
+                   const torch::Tensor &residual, // [..., d]
+                   const torch::Tensor &weight,   // [d]
+                   double eps) {
+
+  torch::Tensor out = torch::empty_like(input);
+  torch::Tensor add_out = torch::empty_like(input);
+
   AssertTensorShapeEqual(input, residual, "input", "residual");
-  AssertTensorShapeEqual(input, out, "input", "out");
-  AssertTensorShapeEqual(input, add_out, "input", "result");
   AssertTensorNotNull(weight, "weight");
   // TODO shape check
 
-  AssertTensorContiguous(out, "out");
-  AssertTensorContiguous(add_out, "add_out");
   AssertTensorContiguous(input, "input");
   AssertTensorContiguous(residual, "residual");
   AssertTensorContiguous(weight, "weight");
@@ -326,6 +325,8 @@ void fused_add_rms_norm(torch::Tensor &out, // [..., d]
   } else {
     LAUNCH_FUSED_ADD_RMS_NORM(0);
   }
+
+  return {out, add_out};
 }
 
 #define LAUNCH_FUSED_ADD_RMS_NORM_BWD(width) \
@@ -340,22 +341,24 @@ void fused_add_rms_norm(torch::Tensor &out, // [..., d]
         weight.data_ptr<scalar_t>(), eps, d); \
       });
 
-void fused_add_rms_norm_backward(
-    torch::Tensor &input_grad,            // [..., d]
-    torch::Tensor &weight_grad,           // [d]
-    const torch::Tensor &output_grad,     // [..., d]
-    const torch::Tensor &add_output_grad, // [..., d]
-    const torch::Tensor &input,           // [..., d]
-    const torch::Tensor &weight,          // [d]
-    double eps) {
-  AssertTensorShapeEqual(input, input_grad, "input", "input_grad");
+std::tuple<torch::Tensor, torch::Tensor>
+fused_add_rms_norm_backward(const torch::Tensor &output_grad,     // [..., d]
+                            const torch::Tensor &add_output_grad, // [..., d]
+                            const torch::Tensor &input,           // [..., d]
+                            const torch::Tensor &weight,          // [d]
+                            double eps, bool need_input_grad) {
+
+  torch::Tensor input_grad;
+  if (need_input_grad) {
+    input_grad = torch::empty_like(input);
+  }
+  torch::Tensor weight_grad = torch::empty_like(weight);
+
   AssertTensorShapeEqual(input, output_grad, "input", "output_grad");
   AssertTensorShapeEqual(input, output_grad, "input", "add_output_grad");
   AssertTensorNotNull(weight, "weight");
 
   constexpr bool ALLOW_NULL = true;
-  AssertTensorContiguous(input_grad, "input_grad", ALLOW_NULL);
-  AssertTensorContiguous(weight_grad, "weight_grad", ALLOW_NULL);
   AssertTensorContiguous(output_grad, "output_grad");
   AssertTensorContiguous(add_output_grad, "add_output_grad");
   AssertTensorContiguous(input, "input");
@@ -386,4 +389,6 @@ void fused_add_rms_norm_backward(
     at::sum_out(acc, temp_weight_grad, {0});
     weight_grad.copy_(acc);
   }
+
+  return {input_grad, weight_grad};
 }
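The wrapper above only allocates the outputs and launches the existing kernels, so the math is unchanged by this commit. For intuition, a plain-PyTorch reference of what the fused op is expected to produce (this is an assumption about the usual fused-add RMSNorm semantics, not code taken from this repository):

import torch

def fused_add_rms_norm_ref(x, residual, weight, eps):
    # add_out is the residual-added activation; out is its RMS-normalized,
    # weight-scaled version. Accumulate in float32 for numerical stability.
    add_out = x + residual
    var = add_out.float().pow(2).mean(dim=-1, keepdim=True)
    out = add_out.float() * torch.rsqrt(var + eps) * weight.float()
    return out.to(x.dtype), add_out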
tests/test_fused_add_rms_norm.py CHANGED
@@ -81,7 +81,7 @@ def test_fused_add_rms_norm(
 
     out = torch.empty(x.shape, dtype=x.dtype, device=x.device)
     add_out = torch.empty(x.shape, dtype=x.dtype, device=x.device)
-    opcheck(op, (out, add_out, x, residual, weight, eps))
+    opcheck(op, (x, residual, weight, eps))
 
     out = fn(x, residual, weight, eps)
     mod_out, mod_a_out = layer(x, residual)
torch-ext/activation/rms_norm.py CHANGED
@@ -38,10 +38,8 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
     # Note that forward, setup_context, and backward are @staticmethods
     @staticmethod
     def forward(input, residual, weight, eps):
-        output = torch.empty_like(input)
-        add_output = torch.empty_like(input)
-        ops.fused_add_rms_norm(output, add_output, input, residual, weight,
-                               eps)
+        output, add_output = ops.fused_add_rms_norm(input, residual, weight,
+                                                    eps)
         return output, add_output
 
     @staticmethod
@@ -61,20 +59,42 @@ class FusedAddRMSNormFunction(torch.autograd.Function):
         need_in = ctx.needs_input_grad[0]
         need_res = ctx.needs_input_grad[1]
 
-        grad = torch.empty_like(output_grad) if need_in or need_res else None
-
-        weight_grad = torch.empty_like(
-            weight) if ctx.needs_input_grad[2] else None
-
-        ops.fused_add_rms_norm_backward(grad, weight_grad, output_grad,
-                                        add_output_grad, add_output, weight,
-                                        eps)
+        grad, weight_grad = ops.fused_add_rms_norm_backward(
+            output_grad,
+            add_output_grad,
+            add_output,
+            weight,
+            eps,
+            need_input_grad=need_in or need_res)
         input_grad = grad if need_in else None
         residual_grad = grad if need_res else None
 
         return input_grad, residual_grad, weight_grad, None
 
 
+@torch.library.register_fake(ops.rms_norm.default)
+def rms_norm_abstract(x, weight, eps):
+    return torch.empty_like(x)
+
+
+@torch.library.register_fake(ops.rms_norm_backward.default)
+def rms_norm_backward_abstract(output_grad, x, weight, eps):
+    return torch.empty_like(x), torch.empty_like(weight)
+
+
+@torch.library.register_fake(ops.fused_add_rms_norm.default)
+def fused_add_rms_norm_abstract(x, residual, weight, eps):
+    return torch.empty_like(x), torch.empty_like(x)
+
+
+@torch.library.register_fake(ops.fused_add_rms_norm_backward.default)
+def fused_add_rms_norm_backward_abstract(output_grad, add_output_grad,
+                                         add_output, weight, eps,
+                                         need_input_grad: bool):
+    return (torch.empty_like(output_grad) if need_input_grad else None,
+            torch.empty_like(weight))
+
+
 if version.parse(torch.__version__) >= version.parse("2.8"):
     from .rms_norm_meta import register_rms_norm_meta
     register_rms_norm_meta()
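With the out-of-place ops, the autograd wrapper no longer preallocates gradient buffers; it unpacks the tuple returned by the backward op and routes `grad` to the input and/or residual slots. A short usage sketch (the import path and tensor shapes are assumptions; the class and call pattern come from rms_norm.py above):

import torch
from activation.rms_norm import FusedAddRMSNormFunction  # assumed import path

x = torch.randn(2, 64, device="cuda", requires_grad=True)
residual = torch.randn(2, 64, device="cuda", requires_grad=True)
weight = torch.randn(64, device="cuda", requires_grad=True)

out, add_out = FusedAddRMSNormFunction.apply(x, residual, weight, 1e-6)
(out.sum() + add_out.sum()).backward()

# backward() calls ops.fused_add_rms_norm_backward with need_input_grad=True
# (both x and residual require grad) and fills x.grad, residual.grad and
# weight.grad from the returned (grad, weight_grad) tuple.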
torch-ext/activation/rms_norm_meta.py CHANGED
@@ -1,3 +1,5 @@
+from collections.abc import Sequence
+
 import torch
 from torch.distributed.tensor._dtensor_spec import DTensorSpec
 from torch.distributed.tensor._op_schema import (OpSchema, OpSpec, OpStrategy,
@@ -17,16 +19,6 @@ def register_rms_norm_meta():
     pass
 
 
-@torch.library.register_fake(ops.rms_norm.default)
-def rms_norm_abstract(x, weight, eps):
-    return torch.empty_like(x)
-
-
-@torch.library.register_fake(ops.rms_norm_backward.default)
-def rms_norm_backward_abstract(output_grad, x, weight, eps):
-    return torch.empty_like(x), torch.empty_like(weight)
-
-
 def _replicate_dims_start_at(placements: Sequence[Placement],
                              start_dim: int = 0) -> tuple[Placement, ...]:
     new_placements: list[Placement] = []
torch-ext/torch_binding.cpp CHANGED
@@ -38,17 +38,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
            &fused_mul_poly_norm_backward);
 
   // fused_add_rms_norm
-  ops.def(
-      "fused_add_rms_norm(Tensor! out, Tensor! add_out, Tensor input, Tensor "
-      "residual, Tensor "
-      "weight, float eps) -> ()");
+  ops.def("fused_add_rms_norm(Tensor input, Tensor residual, Tensor "
+          "weight, float eps) -> (Tensor, Tensor)");
   ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
 
   ops.def(
-      "fused_add_rms_norm_backward(Tensor! input_grad, Tensor! weight_grad, "
-      "Tensor "
-      "output_grad, Tensor add_output_grad, Tensor input, Tensor weight, float "
-      "eps) -> ()");
+      "fused_add_rms_norm_backward(Tensor output_grad, Tensor add_output_grad,"
+      "Tensor input, Tensor weight, float eps, bool need_input_grad) -> "
+      "(Tensor, Tensor)");
   ops.impl("fused_add_rms_norm_backward", torch::kCUDA,
            &fused_add_rms_norm_backward);
 }
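The schema change is what the rest of the refactor hinges on: the `Tensor!` alias annotations (mutable, caller-provided buffers) are gone and both ops now return `(Tensor, Tensor)`, so PyTorch can treat them as functional. A quick sanity sketch (again assuming the `activation.ops` wrapper; the assertions are illustrative and not part of the test suite):

import torch
from activation import ops  # assumption: the extension's Python ops wrapper

x = torch.randn(2, 64, device="cuda", dtype=torch.float16)
residual = torch.randn_like(x)
weight = torch.randn(64, device="cuda", dtype=torch.float16)

x_before = x.clone()
out, add_out = ops.fused_add_rms_norm(x, residual, weight, 1e-6)
assert torch.equal(x, x_before)        # inputs are no longer written in place
assert out.data_ptr() != x.data_ptr()  # outputs are freshly allocated tensors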
torch-ext/torch_binding.h CHANGED
@@ -27,13 +27,11 @@ void fused_mul_poly_norm_backward(
     const torch::Tensor &mul, const torch::Tensor &weight,
     const torch::Tensor &bias, double eps);
 
-void fused_add_rms_norm(torch::Tensor &out, torch::Tensor &add_out,
-                        const torch::Tensor &input,
-                        const torch::Tensor &residual,
-                        const torch::Tensor &weight, double eps);
-void fused_add_rms_norm_backward(torch::Tensor &input_grad,
-                                 torch::Tensor &weight_grad,
-                                 const torch::Tensor &output_grad,
-                                 const torch::Tensor &add_output_grad,
-                                 const torch::Tensor &input,
-                                 const torch::Tensor &weight, double eps);
+std::tuple<torch::Tensor, torch::Tensor>
+fused_add_rms_norm(const torch::Tensor &input, const torch::Tensor &residual,
+                   const torch::Tensor &weight, double eps);
+
+std::tuple<torch::Tensor, torch::Tensor> fused_add_rms_norm_backward(
+    const torch::Tensor &output_grad, const torch::Tensor &add_output_grad,
+    const torch::Tensor &input, const torch::Tensor &weight, double eps,
+    bool need_input_grad);