Kernels
ca1207 committed
Commit ff6d675 · 1 Parent(s): d65066c

apply all2all scatter gather

test/test_muon/muon.py DELETED
@@ -1 +0,0 @@
- ../../torch-ext/optimizer/muon.py

test/test_muon/optimizer ADDED
@@ -0,0 +1 @@
+ ../../torch-ext/optimizer/
torch-ext/optimizer/matmul_transpose_triton.py ADDED
@@ -0,0 +1,106 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+
+ def get_autotune_config():
+     return [
+         triton.Config(
+             {
+                 'BLOCK_SIZE_M': blk_m,
+                 'BLOCK_SIZE_K': blk_k,
+                 'GROUP_SIZE_M': grp_sz
+             },
+             num_stages=n_stages,
+             num_warps=n_warps) for blk_m in [32, 64, 128]
+         for blk_k in [32, 64] for grp_sz in [8] for n_stages in [3, 4, 5]
+         for n_warps in [4, 8]
+     ]
+
+
+ @triton.autotune(
+     configs=get_autotune_config(),
+     key=['M', 'K'],
+ )
+ @triton.jit
+ def mmt_kernel(x, y, M, K, stride_xm, stride_xk, stride_ym, stride_yn,
+                BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
+                GROUP_SIZE_M: tl.constexpr):
+     """
+     Core kernel jit function of matmul_transpose that computes y = x @ x.T
+     The code is a simple adaptation from the triton `matmul` tutorial:
+     https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html
+     """
+     pid = tl.program_id(axis=0)
+     num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+     num_pid_n = tl.cdiv(M, BLOCK_SIZE_M)
+     num_pid_in_group = GROUP_SIZE_M * num_pid_n
+     group_id = pid // num_pid_in_group
+     first_pid_m = group_id * GROUP_SIZE_M
+     group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+     pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+     pid_n = (pid % num_pid_in_group) // group_size_m
+     # the output is symmetric, so only blocks on or above the diagonal are computed
+     if pid_m > pid_n:
+         return
+
+     offs_xm = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+     offs_xn = (pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+     offs_k = tl.arange(0, BLOCK_SIZE_K)
+     # we use a & b ptrs to denote different rows of x.
+     a_ptrs = x + (offs_xm[:, None] * stride_xm + offs_k[None, :] * stride_xk)
+     b_ptrs = x + (offs_xn[:, None] * stride_xm + offs_k[None, :] * stride_xk)
+
+     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_M), dtype=tl.float32)
+
+     for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+         a = tl.load(a_ptrs,
+                     mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
+                     other=0.0)
+         b = tl.load(b_ptrs,
+                     mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
+                     other=0.0)
+         accumulator = tl.dot(a, tl.permute(b, (1, 0)), accumulator)
+         a_ptrs += BLOCK_SIZE_K * stride_xk
+         b_ptrs += BLOCK_SIZE_K * stride_xk
+     # use dtype.element_ty to accommodate different input datatypes as in cpp templates
+     # https://github.com/triton-lang/triton/issues/2252
+     c = accumulator.to(x.dtype.element_ty)
+
+     offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+     offs_cn = pid_n * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+     c_ptrs = y + stride_ym * offs_cm[:, None] + stride_yn * offs_cn[None, :]
+     c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < M)
+     tl.store(c_ptrs, c, mask=c_mask)
+
+     # transpose and copy the block to its mirrored position below the diagonal
+     if pid_m < pid_n:
+         ct_ptrs = y + stride_ym * offs_cn[:, None] + stride_yn * offs_cm[None, :]
+         ct_mask = (offs_cn[:, None] < M) & (offs_cm[None, :] < M)
+         tl.store(ct_ptrs, tl.permute(c, (1, 0)), mask=ct_mask)
+
+
+ def matmul_transpose_assign(d_in, d_out):
+     assert d_in.is_cuda, "Input `d_in` must be a CUDA tensor"
+     assert d_out.is_cuda, "Input `d_out` must be a CUDA tensor"
+     assert d_in.device == d_out.device, "Inputs `d_in` and `d_out` must be on the same CUDA device"
+     assert d_in.dtype == d_out.dtype, "Inputs must have the same data type"
+     assert d_in.ndim == 2, "Input `d_in` must be a 2D tensor"
+     assert d_out.ndim == 2, "Input `d_out` must be a 2D tensor"
+     assert d_in.size(0) == d_out.size(0) == d_out.size(1), \
+         "First dimension of `d_in` must match first and second dimension of `d_out`"
+
+     d_in = d_in.contiguous()
+     M, K = d_in.shape
+     grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
+         M, META['BLOCK_SIZE_M']), )
+     with torch.cuda.device(d_in.device.index):
+         mmt_kernel[grid](d_in, d_out, M, K, d_in.stride(0), d_in.stride(1),
+                          d_out.stride(0), d_out.stride(1))
+
+
+ def matmul_transpose(d_in):
+     M, _ = d_in.shape
+     d_out = torch.empty((M, M), device=d_in.device, dtype=d_in.dtype)
+     matmul_transpose_assign(d_in, d_out)
+     return d_out
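
Since the file above is new in this commit, a quick sanity check is to compare it against a plain matmul: the kernel computes y = x @ x.T, filling only the upper-triangular blocks and mirroring them. A minimal smoke-test sketch, not part of the commit; the shape, tolerances, and import path (e.g. running from torch-ext/optimizer/) are illustrative assumptions:

import torch
from matmul_transpose_triton import matmul_transpose  # assumed import path

x = torch.randn(512, 1024, device="cuda", dtype=torch.bfloat16)
y = matmul_transpose(x)   # Triton kernel: y = x @ x.T with fp32 accumulation
ref = x @ x.T             # cuBLAS reference
# both paths round to bfloat16, so compare with loose, illustrative tolerances
torch.testing.assert_close(y, ref, rtol=2e-2, atol=1e-1)
print("max abs diff:", (y - ref).abs().max().item())
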
torch-ext/optimizer/muon.py CHANGED
@@ -8,6 +8,8 @@ import torch
  import torch.distributed as dist
  from torch.distributed._tensor import DTensor, Replicate, Shard
 
+ from .matmul_transpose_triton import matmul_transpose_assign
+
  logger = logging.getLogger(__name__)
 
 
@@ -16,6 +18,7 @@ logger = logging.getLogger(__name__)
  # Muon's Newton–Schulz iteration causes high variance in singular values
  # Idea: give each iteration its own 3 coefficients and optimize them via gradient descent.
  @torch.no_grad()
+ # matmul_transpose_assign from: https://github.com/nil0x9/flash-muon
  def _zeropower_via_newtonschulz5(G, steps):
      """
      Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
@@ -28,12 +31,15 @@ def _zeropower_via_newtonschulz5(G, steps):
      """
      assert len(G.shape) == 2
      assert G.dtype == torch.bfloat16
+     G = G.to(torch.float32)
      X = G  # no manual typecast
 
      if G.size(0) > G.size(1):
          X = X.T
      # Ensure spectral norm is at most 1
      X = X / (X.norm() + 1e-7)
+     buf1 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
+     buf2 = torch.empty(X.size(0), X.size(0), dtype=X.dtype, device=X.device)
      # Perform the NS iterations
      for a, b, c in [
          (4.0848, -6.8946, 2.9270),
@@ -42,16 +48,14 @@
          (2.8769, -3.1427, 1.2046),
          (2.8366, -3.0525, 1.2012),
      ]:
-         A = X @ X.T
-         # B = (
-         #     b * A + c * A @ A
-         # )
-         B = torch.addmm(A, A, A, alpha=c, beta=b)
-         # X = a * X + B @ X
-         X = torch.addmm(X, B, X, alpha=1.0, beta=a)
+         matmul_transpose_assign(X, buf1)
+         matmul_transpose_assign(buf1, buf2)
+         buf1.mul_(b).add_(buf2, alpha=c)
+         X = torch.addmm(X, buf1, X, alpha=1.0, beta=a)
 
      if G.size(0) > G.size(1):
          X = X.T
+     X = X.to(torch.bfloat16)
      return X
 
 
@@ -69,51 +73,130 @@ class _muon_state:
      qk_clip_state = None
 
 
+ def split_elems_for_src(param, state, src_rank, num_ranks) -> int:
+     rows = param.shape[0]
+     cols = int(param.numel() // rows)
+     base, rem = divmod(rows, num_ranks)
+     my_rows = base + (1 if src_rank < rem else 0)
+     return my_rows * cols
+
+
  @torch.no_grad()
- def _gather(p, state, rank, comm_stream, none_grad):
-     """
-     Gather the gradients to worker_rank.
-     If none_grad is True, free p.grad after the gather.
-     """
+ def _alloc_gathered_grad(params, param_to_state, rank, compute_stream):
+     with torch.cuda.stream(compute_stream):
+         for p in params:
+             state = param_to_state[id(p)]
+             if rank == state.worker_rank:
+                 num_ranks = dist.get_world_size(group=state.process_group)
+                 state.gathered_grad = torch.empty(p.grad.numel(),
+                                                   dtype=torch.bfloat16,
+                                                   device="cuda")
+             else:
+                 state.gathered_grad = None
+
+         alloc_event = torch.cuda.Event()
+         alloc_event.record(compute_stream)
+         return alloc_event
+
+
+ @torch.no_grad()
+ def _all2all_gather(params, param_to_state, rank, comm_stream, none_grad,
+                     alloc_event):
      with torch.cuda.stream(comm_stream):
-         g = p.grad
+         process_group = param_to_state[id(params[0])].process_group
+         num_ranks = dist.get_world_size(group=process_group)
 
-         if rank == state.worker_rank:
-             num_ranks = dist.get_world_size(group=state.process_group)
-             gather_list = [
-                 torch.empty_like(g.to_local(), dtype=torch.bfloat16)
-                 for _ in range(num_ranks)
-             ]
-         else:
-             gather_list = None
-
-         g = g.to(torch.bfloat16)
-         torch.distributed.gather(
-             g.to_local(),
-             dst=state.worker_rank,
-             gather_list=gather_list,
-             group=state.process_group,
+         # Calculate sending tensors
+         per_dst = [[] for _ in range(num_ranks)]
+         send_counts = [0] * num_ranks
+         for p in params:
+             state = param_to_state[id(p)]
+             dst = state.worker_rank
+             shard_elems = split_elems_for_src(p, state, rank, num_ranks)
+             g = p.grad
+             g = g.to_local().to(torch.bfloat16).contiguous().view(-1)
+             assert g.numel() == shard_elems
+             per_dst[dst].append(g)
+             send_counts[dst] += shard_elems
+
+         assert all(
+             len(v) > 0
+             for v in per_dst), "all params should be sharded to all devices"
+
+         send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
+         owner_params = [
+             p for p in params if param_to_state[id(p)].worker_rank == rank
+         ]
+
+         # Calculate receiving tensors
+         recv_counts = [0] * num_ranks
+         for src in range(num_ranks):
+             total = 0
+             for p in owner_params:
+                 state = param_to_state[id(p)]
+                 assert state.worker_rank == rank
+                 total += split_elems_for_src(p, state, src, num_ranks)
+             recv_counts[src] = total
+
+         recv_total = sum(recv_counts)
+         recv_buf = torch.empty(recv_total, dtype=torch.bfloat16, device="cuda")
+         dist.all_to_all_single(
+             recv_buf,
+             send_buf,
+             output_split_sizes=recv_counts,
+             input_split_sizes=send_counts,
+             group=process_group,
          )
-         if rank == state.worker_rank:
-             if state.gathered_grad is not None:
-                 raise RuntimeError(
-                     "Gather event already exists, which should not happen.")
-             state.gathered_grad = torch.cat(gather_list, dim=0)
-             state.gather_event = torch.cuda.Event()
-             state.gather_event.record()
-         else:
-             state.gathered_grad = None
-             state.gather_event = None
-         gather_list = None
-         if none_grad:
-             # We can safely free p.grad without calling record_stream:
-             #   p.grad.to_local().record_stream(comm_stream)
-             # Explanation:
-             # 1. p.grad is created on the default stream, but the default stream
-             #    is synchronized with the comm stream later.
-             # 2. There is no further activity on the default stream before the optimizer finishes.
-             # Therefore, it is safe to free p.grad directly on the comm stream.
-             p.grad = None
+
+         # Reconstruct the gathered grad from the received buffer
+         #
+         # recv_buf (num ranks = 3)
+         #
+         #   From rank 0         From rank 1         From rank 2
+         # | p1_0, p2_0, p3_0 | p1_1, p2_1, p3_1 | p1_2, p2_2, p3_2 |
+         #
+         # Outer loop:
+         #   rank 0 -> rank 1 -> rank 2
+         #
+         # Inner loop:
+         #   p1_n -> p2_n -> p3_n
+
+         comm_stream.wait_event(alloc_event)
+
+         off = 0
+         write_offsets = {id(p): 0 for p in owner_params}
+         for src in range(num_ranks):
+             if recv_counts[src] == 0:
+                 continue
+
+             block = recv_counts[src]
+             inner_off = 0
+             for p in owner_params:
+                 state = param_to_state[id(p)]
+                 assert state.worker_rank == rank
+                 n = split_elems_for_src(p, state, src, num_ranks)
+                 assert n > 0
+
+                 sg = recv_buf.narrow(0, off + inner_off, n)
+                 woff = write_offsets[id(p)]
+                 dst = state.gathered_grad.narrow(0, woff, n)
+                 dst.copy_(sg)
+
+                 write_offsets[id(p)] += n
+                 inner_off += n
+             off += block
+
+         for p in params:
+             state = param_to_state[id(p)]
+             if state.worker_rank == rank:
+                 state.gathered_grad = state.gathered_grad.view_as(p)
+                 state.gather_event = torch.cuda.Event()
+                 state.gather_event.record(comm_stream)
+             else:
+                 state.gathered_grad = None
+                 state.gather_event = None
+             if none_grad:
+                 p.grad = None
 
 
  @torch.no_grad()
@@ -127,45 +210,120 @@ def _compute_u(p, state, steps, rank, compute_stream):
                  raise RuntimeError("Gather event must be set before compute.")
              compute_stream.wait_event(state.gather_event)
              u = _zeropower_via_newtonschulz5(state.gathered_grad, steps)
+             state.gathered_grad = None
              state.computed_u = u
-             state.scattered_u = torch.empty_like(p.to_local(),
-                                                  dtype=torch.bfloat16)
-             state.compute_event = torch.cuda.Event()
-             state.compute_event.record()
-             u = None
+             state.compute_event = torch.cuda.Event()
+             state.compute_event.record()
+         else:
+             state.computed_u = None
+             state.compute_event = None
 
 
  @torch.no_grad()
- def _scatter(p, state, rank, comm_stream):
-     """
-     Scatter the computed_u from worker_rank to all ranks.
-     """
+ def _alloc_scattered_u(params, param_to_state, rank, compute_stream):
+     with torch.cuda.stream(compute_stream):
+         for p in params:
+             state = param_to_state[id(p)]
+             state.scattered_u = torch.empty_like(p.to_local(),
+                                                  dtype=torch.bfloat16)
 
+         alloc_event = torch.cuda.Event()
+         alloc_event.record(compute_stream)
+         return alloc_event
+
+
+ def _all2all_scatter(params, param_to_state, rank, comm_stream, alloc_event):
      with torch.cuda.stream(comm_stream):
-         if state.compute_event is None:
-             raise RuntimeError("Compute event must be set before scatter.")
-         comm_stream.wait_event(state.compute_event)
+         process_group = param_to_state[id(params[0])].process_group
+         num_ranks = dist.get_world_size(group=process_group)
+         owner_params = [
+             p for p in params if param_to_state[id(p)].worker_rank == rank
+         ]
 
-         if rank == state.worker_rank:
-             num_ranks = dist.get_world_size(group=state.process_group)
-             # Clear the gathered gradient to free memory
-             state.gathered_grad = None
+         per_dst = [[] for _ in range(num_ranks)]
+         send_counts = [0] * num_ranks
 
-             u = state.computed_u
-             scatter_list = list(torch.split(u, p.size(0) // num_ranks, dim=0))
-             scatter_list = [s.contiguous() for s in scatter_list]
-         else:
-             scatter_list = None
+         if owner_params:
+             for p in owner_params:
+                 state = param_to_state[id(p)]
+                 if state.compute_event is None:
+                     raise RuntimeError(
+                         "Compute event must be set before scatter.")
+                 comm_stream.wait_event(state.compute_event)
+                 state.gathered_grad = None
 
-         torch.distributed.scatter(
-             state.scattered_u,
-             scatter_list=scatter_list,
-             src=state.worker_rank,
-             group=state.process_group,
+                 assert state.computed_u is not None
+
+                 u_full = state.computed_u.to(
+                     torch.bfloat16).contiguous().view(-1)
+
+                 offset = 0
+                 for dst in range(num_ranks):
+                     n = split_elems_for_src(p, state, dst, num_ranks)
+                     assert n > 0
+
+                     su = u_full.narrow(0, offset, n)
+                     per_dst[dst].append(su)
+                     send_counts[dst] += n
+                     offset += n
+
+                 assert offset == u_full.numel()
+
+         if any(len(v) > 0 for v in per_dst):
+             send_buf = torch.cat([torch.cat(v, dim=0) for v in per_dst], dim=0)
+         else:
+             # all_to_all requires participation from all ranks
+             # Even non-owner ranks must join the collective call
+             send_buf = torch.empty(0, dtype=torch.bfloat16, device="cuda")
+
+         recv_counts = [0] * num_ranks
+         for src in range(num_ranks):
+             total = 0
+             for p in params:
+                 state = param_to_state[id(p)]
+                 if state.worker_rank != src:
+                     continue
+                 total += split_elems_for_src(p, state, rank, num_ranks)
+             recv_counts[src] = total
+
+         recv_total = sum(recv_counts)
+         assert recv_total > 0
+         recv_buf = torch.empty(recv_total, dtype=torch.bfloat16, device="cuda")
+
+         dist.all_to_all_single(
+             recv_buf,
+             send_buf,
+             output_split_sizes=recv_counts,
+             input_split_sizes=send_counts,
+             group=process_group,
          )
-         state.scatter_event = torch.cuda.Event()
-         state.scatter_event.record()
-         scatter_list = None
+
+         comm_stream.wait_event(alloc_event)
+
+         off = 0
+         for src in range(num_ranks):
+             block = recv_counts[src]
+             if block == 0:
+                 continue
+
+             inner_off = 0
+             for p in params:
+                 state = param_to_state[id(p)]
+                 if state.worker_rank != src:
+                     continue
+                 n = split_elems_for_src(p, state, rank, num_ranks)
+                 assert n > 0
+
+                 flat_local = recv_buf.narrow(0, off + inner_off,
+                                              n).view_as(p.to_local())
+                 state.scattered_u.copy_(flat_local)
+
+                 state.scatter_event = torch.cuda.Event()
+                 state.scatter_event.record(comm_stream)
+                 inner_off += n
+
+             assert inner_off == block
+             off += block
 
 
  def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
@@ -585,11 +743,15 @@ class Muon(torch.optim.Optimizer):
          param_to_state, ordered_params = self.init_state_and_assign_params(
              names, params, group, qk_logits)
 
-         def enqueue_gathers(start_idx, chunk_size):
-             for p in ordered_params[start_idx:start_idx + chunk_size]:
-                 state = param_to_state[id(p)]
-                 _gather(p, state, self.rank, self.comm_stream,
-                         group["none_grad"])
+         def enqueue_all2all_gather(start_idx, chunk_size):
+             target_params = ordered_params[start_idx:start_idx + chunk_size]
+             if target_params:
+                 alloc_event = _alloc_gathered_grad(target_params,
+                                                    param_to_state, self.rank,
+                                                    self.compute_stream)
+                 _all2all_gather(target_params, param_to_state, self.rank,
+                                 self.comm_stream, group["none_grad"],
+                                 alloc_event)
 
          def enqueue_computes(start_idx, chunk_size):
              for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -597,10 +759,14 @@
                  _compute_u(p, state, group["ns_steps"], self.rank,
                             self.compute_stream)
 
-         def enqueue_scatters(start_idx, chunk_size):
-             for p in ordered_params[start_idx:start_idx + chunk_size]:
-                 state = param_to_state[id(p)]
-                 _scatter(p, state, self.rank, self.comm_stream)
+         def enqueue_all2all_scatter(start_idx, chunk_size):
+             target_params = ordered_params[start_idx:start_idx + chunk_size]
+             if target_params:
+                 alloc_event = _alloc_scattered_u(target_params, param_to_state,
+                                                  self.rank,
+                                                  self.compute_stream)
+                 _all2all_scatter(target_params, param_to_state, self.rank,
+                                  self.comm_stream, alloc_event)
 
          def enqueue_update_param(start_idx, chunk_size):
              for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -615,14 +781,16 @@
          # Wait grad update
          self.comm_stream.wait_stream(torch.cuda.current_stream())
 
-         enqueue_gathers(0, chunk_size)
+         PRE_STEP = 5
+         for i in range(0, PRE_STEP):
+             enqueue_all2all_gather(i * chunk_size, chunk_size)
+             enqueue_computes(i * chunk_size, chunk_size)
+
          for i in range(0, len(params) + chunk_size - 1, chunk_size):
-             enqueue_computes(i, chunk_size)
-             if i > 0:
-                 enqueue_update_param(i - chunk_size, chunk_size)
-             enqueue_gathers(i + chunk_size, chunk_size)
-             enqueue_scatters(i, chunk_size)
-             enqueue_update_param(i, chunk_size)
+             enqueue_all2all_scatter(i, chunk_size)
+             enqueue_all2all_gather(i + PRE_STEP * chunk_size, chunk_size)
+             enqueue_update_param(i, chunk_size)
+             enqueue_computes(i + PRE_STEP * chunk_size, chunk_size)
 
          # Wait the last update_param to finish
          torch.cuda.current_stream().wait_stream(self.compute_stream)
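
Both _all2all_gather and _all2all_scatter above reduce to a single dist.all_to_all_single call with explicit split sizes: each rank packs its per-destination shards into one flat bfloat16 send buffer and receives one contiguous block per source rank, which is then unpacked per parameter. A standalone sketch of just that collective pattern, separate from the optimizer code; the element counts are hypothetical and it assumes a NCCL process group launched with torchrun on one GPU per rank:

# hypothetical demo, e.g. torchrun --nproc_per_node=2 all2all_demo.py
import torch
import torch.distributed as dist

dist.init_process_group("nccl")
rank = dist.get_rank()
world = dist.get_world_size()
torch.cuda.set_device(rank)

# this rank sends (rank + 1) bf16 elements to every destination rank
send_counts = [rank + 1] * world
send_buf = torch.full((sum(send_counts),), float(rank),
                      dtype=torch.bfloat16, device="cuda")

# and receives (src + 1) elements from every source rank
recv_counts = [src + 1 for src in range(world)]
recv_buf = torch.empty(sum(recv_counts), dtype=torch.bfloat16, device="cuda")

dist.all_to_all_single(recv_buf, send_buf,
                       output_split_sizes=recv_counts,
                       input_split_sizes=send_counts)

# recv_buf now holds one contiguous block per source rank, in rank order;
# this is the layout the copy loops in _all2all_gather/_all2all_scatter unpack
print(rank, recv_buf.tolist())
dist.destroy_process_group()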
 
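The rewritten step loop above turns the per-chunk gather/compute/scatter sequence into a software pipeline: PRE_STEP chunks of all2all-gather plus compute are enqueued up front, and each iteration then scatters and updates chunk i while prefetching chunk i + PRE_STEP. A CPU-only dry run with made-up sizes (print statements standing in for the CUDA/stream work) that shows the resulting enqueue order:

# dry-run sketch only; num_params and chunk_size are hypothetical
PRE_STEP = 5
chunk_size = 2
num_params = 16

def enqueue(op, start_idx):
    # mirrors the empty-slice check in enqueue_all2all_gather/scatter
    if start_idx < num_params:
        print(f"{op}(params[{start_idx}:{start_idx + chunk_size}])")

# prologue: prefetch PRE_STEP chunks of gather + compute
for i in range(PRE_STEP):
    enqueue("all2all_gather", i * chunk_size)
    enqueue("compute_u", i * chunk_size)

# steady state: scatter/update chunk i, prefetch chunk i + PRE_STEP
for i in range(0, num_params + chunk_size - 1, chunk_size):
    enqueue("all2all_scatter", i)
    enqueue("all2all_gather", i + PRE_STEP * chunk_size)
    enqueue("update_param", i)
    enqueue("compute_u", i + PRE_STEP * chunk_size)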