dongseokmotif dongseokmotif github-actions[bot] committed on Sep 24

Commit

d65066c

unverified ·

1 Parent(s): 8997e30

feat(muon_clip) : add muon clip (#6)

* feat(muon_clip) : add muon clip

* fix(muon_clip): delete comment

* fix(muon_clip): delete comment

* fix(muon_clip): considering when nkvheadgroup>1

* docs(muon_clip): refine __init__ docstring and add clip_info argument description

* refactor(muon_clip): refactor clip info using dataclass

* fix(muon_clip): change min -> new_scaling compare

* test(muon): add qk_clip=False case to model comparison

* test(muon): show results

* fix(muon_clip): change default is muon func

* Add built binary [ci skip]

---------

Co-authored-by: dongseokmotif <[email protected]>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

Files changed (29) hide show

build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py +3 -3
build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_4043ece_dirty.abi3.so → _optimizer_9c21645_dirty.abi3.so} +1 -1
build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py +198 -39
build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_4043ece_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so} +1 -1
build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py +198 -39
build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_4043ece_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so} +1 -1
build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py +198 -39
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_4043ece_dirty.abi3.so → _optimizer_9c21645_dirty.abi3.so} +1 -1
build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py +198 -39
build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py +3 -3
build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_4043ece_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so} +1 -1
build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py +198 -39
build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py +3 -3
build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_4043ece_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so} +1 -1
build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py +198 -39
build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py +3 -3
build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_4043ece_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so} +1 -1
build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py +198 -39
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py +3 -3
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_4043ece_dirty.abi3.so → _optimizer_9c21645_dirty.abi3.so} +1 -1
build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py +198 -39
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py +3 -3
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_4043ece_dirty.abi3.so → _optimizer_9c21645_dirty.abi3.so} +1 -1
build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py +198 -39
test/test_muon/test.py +47 -14
torch-ext/optimizer/muon.py +198 -39

build/torch27-cxx11-cu118-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_4043ece_dirty
-ops = torch.ops._optimizer_4043ece_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_4043ece_dirty::{op_name}"

 import torch
+from . import _optimizer_9c21645_dirty
+ops = torch.ops._optimizer_9c21645_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_9c21645_dirty::{op_name}"

build/torch27-cxx11-cu118-x86_64-linux/optimizer/{_optimizer_4043ece_dirty.abi3.so → _optimizer_9c21645_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cd521b375aefeabe5cd5b38215d71b393e3902ed347426c64307e37c01f79a7c
 size 1787368

 version https://git-lfs.github.com/spec/v1
+oid sha256:bf8b97161714dff91953d26ae0bf59ebc9f3653ce57a3998723cc08aa97b71e6
 size 1787368

build/torch27-cxx11-cu118-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import math
 import types
 from dataclasses import dataclass
-from typing import Optional, Union, cast
 import torch
 import torch.distributed as dist
@@ -66,6 +66,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
 @torch.no_grad()
@@ -193,32 +194,93 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
         state.scattered_u = None
         u_dtensor = None
 def default_is_muon(name, x):
-    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
     return [
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            True
         },
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (not is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            False
         },
     ]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -246,21 +308,38 @@ class Muon(torch.optim.Optimizer):
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
     """
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        momentum=0.95,
-        nesterov=True,
-        ns_steps=5,
-        weight_decay=0.1,
-        adamw_betas=(0.9, 0.95),
-        adamw_eps=1e-8,
-        none_grad=True,
-        debug=False,
-    ):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -292,6 +371,7 @@ class Muon(torch.optim.Optimizer):
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
@@ -327,7 +407,7 @@ class Muon(torch.optim.Optimizer):
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
-    def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -346,15 +426,21 @@ class Muon(torch.optim.Optimizer):
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
-        ordered_params = sorted(params,
-                                key=lambda p: param_to_flops[id(p)],
-                                reverse=True)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
-        for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
@@ -364,14 +450,16 @@ class Muon(torch.optim.Optimizer):
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
-    def base(self, params, group, lr, weight_decay, momentum):
         # generate weight updates in distributed fashion
-        for p in params:
             g = p.grad
             if g is None:
                 continue
@@ -396,6 +484,12 @@ class Muon(torch.optim.Optimizer):
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
@@ -416,7 +510,58 @@ class Muon(torch.optim.Optimizer):
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
-    def parallel(self, params, group, lr, weight_decay, momentum):
         """
         Perform a parallel optimization step using Muon.
         """
@@ -438,7 +583,7 @@ class Muon(torch.optim.Optimizer):
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -553,12 +698,16 @@ class Muon(torch.optim.Optimizer):
                 maximize=maximize,
             )
-    def step(self, closure=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
         loss = None
         if closure is not None:
@@ -575,11 +724,14 @@ class Muon(torch.optim.Optimizer):
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
                 param_dtensors = []
                 param_tensors = []
-                for p in params:
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
@@ -587,10 +739,13 @@ class Muon(torch.optim.Optimizer):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
                         else:
                             param_dtensors.append(p)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
@@ -608,20 +763,24 @@ class Muon(torch.optim.Optimizer):
                         )
                     self.parallel(
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
                 if len(param_tensors) > 0:
                     self.base(
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
             else:

 import math
 import types
 from dataclasses import dataclass
+from typing import List, Optional, Union, cast
 import torch
 import torch.distributed as dist
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
+    qk_clip_state = None
 @torch.no_grad()
         state.scattered_u = None
         u_dtensor = None
+        scales_full = Muon._compute_scales(p, state.qk_clip_state)
+        if scales_full is not None:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            local_rank = dist.get_rank(group=state.process_group)
+            scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
+            scales_local = DTensor.from_local(
+                scales_local,
+                placements=p.placements,
+                device_mesh=p.device_mesh,
+            )
+            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
 def default_is_muon(name, x):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    return x.ndim >= 2 and not any(key in name for key in skip_keys)
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    muon_params, muon_names = [], []
+    non_muon_params = []
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
     return [
         {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
         },
         {
+            "params": non_muon_params,
+            "use_muon": False,
         },
     ]
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+    Example:
+        'model.3.attn.wq.weight'      -> ('wq', 3)
+        'model.5.attn.wk.weight'      -> ('wk', 5)
+        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight'  -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+    kind = parts[-2]
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+    return None, -1
+@dataclass
+class QKClipInfo:
+    """Per-parameter dynamic info computed from config + runtime logits."""
+    kind: Optional[str]  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
+    indices: List[int]  # which heads to consider for clipping
+    head_dim: int  # from config
+    threshold: float  # from config
+    logit: Optional[torch.Tensor]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
+        clip_info : Configuration for QK clipping. Expected keys:
+            - "q_indices" (list[int]): Indices of query heads to consider.
+            - "k_indices" (list[int]): Indices of key heads to consider.
+            - "head_dim" (int): Dimensionality of each attention head.
+            - "threshold" (float): Threshold value; heads whose QK logits exceed
+            this value will be scaled down.
+            Default is:
+                {
+                    "q_indices": [],
+                    "k_indices": [],
+                    "head_dim": 128,
+                    "threshold": 100
+                }
     """
+    def __init__(self,
+                 params,
+                 lr=1e-3,
+                 momentum=0.95,
+                 nesterov=True,
+                 ns_steps=5,
+                 weight_decay=0.1,
+                 adamw_betas=(0.9, 0.95),
+                 adamw_eps=1e-8,
+                 none_grad=True,
+                 debug=False,
+                 clip_config={
+                     "q_indices": [],
+                     "k_indices": [],
+                     "head_dim": 128,
+                     "threshold": 100
+                 }):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
+        self.clip_config = clip_config
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
+    def init_state_and_assign_params(self, names, params, group, qk_logits):
         param_to_state = {}
         param_to_flops = {}
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
+        paired = list(zip(names, params))
+        paired_sorted = sorted(paired,
+                               key=lambda x: param_to_flops[id(x[1])],
+                               reverse=True)
+        names_sorted, params_sorted = zip(*paired_sorted)
+        ordered_names = list(names_sorted)
+        ordered_params = list(params_sorted)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
+        for n, p in zip(ordered_names, ordered_params):
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            param_to_state[id(p)].qk_clip_state = qk_clip_state
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
+    def base(self, names, params, group, lr, weight_decay, momentum,
+             qk_logits):
         # generate weight updates in distributed fashion
+        for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            scales_full = self._compute_scales(p, qk_clip_state)
+            if scales_full is not None:
+                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
+    def get_qk_clip_info(self, n, qk_logits):
+        head_dim = self.clip_config.get('head_dim')
+        threshold = self.clip_config.get('threshold')
+        kind, layer_idx = parse_qk_layer(n)
+        logit, indices = None, []
+        if qk_logits is not None and kind is not None:
+            logit = qk_logits[layer_idx]
+            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+            indices = self.clip_config.get(indices_key, []) or []
+        return QKClipInfo(
+            kind=kind,
+            indices=indices,
+            head_dim=head_dim,
+            threshold=threshold,
+            logit=logit,
+        )
+    @staticmethod
+    def _compute_scales(p, qk_clip_state):
+        kind = qk_clip_state.kind
+        indices = qk_clip_state.indices
+        head_dim = qk_clip_state.head_dim
+        threshold = qk_clip_state.threshold
+        logit = qk_clip_state.logit
+        H_global = p.shape[0] // head_dim
+        scales_full = torch.ones(H_global, device=p.data.device)
+        scaling = 0
+        for logit_idx, head_idx in enumerate(indices):
+            v_ele = float(logit[logit_idx])
+            if v_ele > threshold:
+                new_scale = math.sqrt(threshold / v_ele)
+                if new_scale < scales_full[head_idx]:
+                    scales_full[head_idx] = new_scale
+                    logger.info(
+                        f"[{kind}] Head {head_idx} exceeded threshold "
+                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    )
+                    scaling += 1
+        return scales_full if scaling > 0 else None
+    @staticmethod
+    def _qk_clip(p, scales, head_dim):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+    def parallel(self, names, params, group, lr, weight_decay, momentum,
+                 qk_logits):
         """
         Perform a parallel optimization step using Muon.
         """
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
+            names, params, group, qk_logits)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 maximize=maximize,
             )
+    def step(self, closure=None, qk_logits=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
+            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                to 1D tensors of shape (num_heads,), representing the maximum
+                QK logits across all tokens, computed as
+                (1 / sqrt(head_dim)) * (Q @ K^T).
         """
         loss = None
         if closure is not None:
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
+                names = group["names"]
                 param_dtensors = []
                 param_tensors = []
+                name_dtensors = []
+                name_tensors = []
+                for n, p in zip(names, params):
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
+                            name_tensors.append(n)
                         else:
                             param_dtensors.append(p)
+                            name_dtensors.append(n)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
+                        name_tensors.append(n)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
                         )
                     self.parallel(
+                        name_dtensors,
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
                 if len(param_tensors) > 0:
                     self.base(
+                        name_tensors,
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
             else:

build/torch27-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_4043ece_dirty
-ops = torch.ops._optimizer_4043ece_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_4043ece_dirty::{op_name}"

 import torch
+from . import _optimizer_9c21645_dirty
+ops = torch.ops._optimizer_9c21645_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_9c21645_dirty::{op_name}"

build/{torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_4043ece_dirty.abi3.so → torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:282b0d443dc7b9c82703e5fd0f1a0faea94370934a92bef5042bf53ac3cae39c
 size 1824256

 version https://git-lfs.github.com/spec/v1
+oid sha256:42ae6ac1cf967d7d23cac7930c8db635105f60631220a60b9cee060d082f40ae
 size 1824256

build/torch27-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import math
 import types
 from dataclasses import dataclass
-from typing import Optional, Union, cast
 import torch
 import torch.distributed as dist
@@ -66,6 +66,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
 @torch.no_grad()
@@ -193,32 +194,93 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
         state.scattered_u = None
         u_dtensor = None
 def default_is_muon(name, x):
-    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
     return [
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            True
         },
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (not is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            False
         },
     ]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -246,21 +308,38 @@ class Muon(torch.optim.Optimizer):
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
     """
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        momentum=0.95,
-        nesterov=True,
-        ns_steps=5,
-        weight_decay=0.1,
-        adamw_betas=(0.9, 0.95),
-        adamw_eps=1e-8,
-        none_grad=True,
-        debug=False,
-    ):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -292,6 +371,7 @@ class Muon(torch.optim.Optimizer):
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
@@ -327,7 +407,7 @@ class Muon(torch.optim.Optimizer):
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
-    def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -346,15 +426,21 @@ class Muon(torch.optim.Optimizer):
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
-        ordered_params = sorted(params,
-                                key=lambda p: param_to_flops[id(p)],
-                                reverse=True)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
-        for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
@@ -364,14 +450,16 @@ class Muon(torch.optim.Optimizer):
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
-    def base(self, params, group, lr, weight_decay, momentum):
         # generate weight updates in distributed fashion
-        for p in params:
             g = p.grad
             if g is None:
                 continue
@@ -396,6 +484,12 @@ class Muon(torch.optim.Optimizer):
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
@@ -416,7 +510,58 @@ class Muon(torch.optim.Optimizer):
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
-    def parallel(self, params, group, lr, weight_decay, momentum):
         """
         Perform a parallel optimization step using Muon.
         """
@@ -438,7 +583,7 @@ class Muon(torch.optim.Optimizer):
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -553,12 +698,16 @@ class Muon(torch.optim.Optimizer):
                 maximize=maximize,
             )
-    def step(self, closure=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
         loss = None
         if closure is not None:
@@ -575,11 +724,14 @@ class Muon(torch.optim.Optimizer):
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
                 param_dtensors = []
                 param_tensors = []
-                for p in params:
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
@@ -587,10 +739,13 @@ class Muon(torch.optim.Optimizer):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
                         else:
                             param_dtensors.append(p)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
@@ -608,20 +763,24 @@ class Muon(torch.optim.Optimizer):
                         )
                     self.parallel(
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
                 if len(param_tensors) > 0:
                     self.base(
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
             else:

 import math
 import types
 from dataclasses import dataclass
+from typing import List, Optional, Union, cast
 import torch
 import torch.distributed as dist
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
+    qk_clip_state = None
 @torch.no_grad()
         state.scattered_u = None
         u_dtensor = None
+        scales_full = Muon._compute_scales(p, state.qk_clip_state)
+        if scales_full is not None:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            local_rank = dist.get_rank(group=state.process_group)
+            scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
+            scales_local = DTensor.from_local(
+                scales_local,
+                placements=p.placements,
+                device_mesh=p.device_mesh,
+            )
+            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
 def default_is_muon(name, x):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    return x.ndim >= 2 and not any(key in name for key in skip_keys)
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    muon_params, muon_names = [], []
+    non_muon_params = []
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
     return [
         {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
         },
         {
+            "params": non_muon_params,
+            "use_muon": False,
         },
     ]
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+    Example:
+        'model.3.attn.wq.weight'      -> ('wq', 3)
+        'model.5.attn.wk.weight'      -> ('wk', 5)
+        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight'  -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+    kind = parts[-2]
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+    return None, -1
+@dataclass
+class QKClipInfo:
+    """Per-parameter dynamic info computed from config + runtime logits."""
+    kind: Optional[str]  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
+    indices: List[int]  # which heads to consider for clipping
+    head_dim: int  # from config
+    threshold: float  # from config
+    logit: Optional[torch.Tensor]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
+        clip_config: Configuration for QK clipping. Expected keys:
+            - "q_indices" (list[int]): Indices of query heads to consider.
+            - "k_indices" (list[int]): Indices of key heads to consider.
+            - "head_dim" (int): Dimensionality of each attention head.
+            - "threshold" (float): Threshold value; heads whose QK logits exceed
+            this value will be scaled down.
+            Default is:
+                {
+                    "q_indices": [],
+                    "k_indices": [],
+                    "head_dim": 128,
+                    "threshold": 100
+                }
     """
+    def __init__(self,
+                 params,
+                 lr=1e-3,
+                 momentum=0.95,
+                 nesterov=True,
+                 ns_steps=5,
+                 weight_decay=0.1,
+                 adamw_betas=(0.9, 0.95),
+                 adamw_eps=1e-8,
+                 none_grad=True,
+                 debug=False,
+                 clip_config={
+                     "q_indices": [],
+                     "k_indices": [],
+                     "head_dim": 128,
+                     "threshold": 100
+                 }):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
+        self.clip_config = clip_config
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
+    def init_state_and_assign_params(self, names, params, group, qk_logits):
         param_to_state = {}
         param_to_flops = {}
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
+        paired = list(zip(names, params))
+        paired_sorted = sorted(paired,
+                               key=lambda x: param_to_flops[id(x[1])],
+                               reverse=True)
+        names_sorted, params_sorted = zip(*paired_sorted)
+        ordered_names = list(names_sorted)
+        ordered_params = list(params_sorted)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
+        for n, p in zip(ordered_names, ordered_params):
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            param_to_state[id(p)].qk_clip_state = qk_clip_state
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
+    def base(self, names, params, group, lr, weight_decay, momentum,
+             qk_logits):
         # generate weight updates in distributed fashion
+        for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            scales_full = self._compute_scales(p, qk_clip_state)
+            if scales_full is not None:
+                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
+    def get_qk_clip_info(self, n, qk_logits):
+        head_dim = self.clip_config.get('head_dim')
+        threshold = self.clip_config.get('threshold')
+        kind, layer_idx = parse_qk_layer(n)
+        logit, indices = None, []
+        if qk_logits is not None and kind is not None:
+            logit = qk_logits[layer_idx]
+            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+            indices = self.clip_config.get(indices_key, []) or []
+        return QKClipInfo(
+            kind=kind,
+            indices=indices,
+            head_dim=head_dim,
+            threshold=threshold,
+            logit=logit,
+        )
+    @staticmethod
+    def _compute_scales(p, qk_clip_state):
+        kind = qk_clip_state.kind
+        indices = qk_clip_state.indices
+        head_dim = qk_clip_state.head_dim
+        threshold = qk_clip_state.threshold
+        logit = qk_clip_state.logit
+        H_global = p.shape[0] // head_dim
+        scales_full = torch.ones(H_global, device=p.data.device)
+        scaling = 0
+        for logit_idx, head_idx in enumerate(indices):
+            v_ele = float(logit[logit_idx])
+            if v_ele > threshold:
+                new_scale = math.sqrt(threshold / v_ele)
+                if new_scale < scales_full[head_idx]:
+                    scales_full[head_idx] = new_scale
+                    logger.info(
+                        f"[{kind}] Head {head_idx} exceeded threshold "
+                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    )
+                    scaling += 1
+        return scales_full if scaling > 0 else None
+    @staticmethod
+    def _qk_clip(p, scales, head_dim):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+    def parallel(self, names, params, group, lr, weight_decay, momentum,
+                 qk_logits):
         """
         Perform a parallel optimization step using Muon.
         """
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
+            names, params, group, qk_logits)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 maximize=maximize,
             )
+    def step(self, closure=None, qk_logits=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
+            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                to 1D tensors of shape (num_heads,), representing the maximum
+                QK logits across all tokens, computed as
+                (1 / sqrt(head_dim)) * (Q @ K^T).
         """
         loss = None
         if closure is not None:
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
+                names = group["names"]
                 param_dtensors = []
                 param_tensors = []
+                name_dtensors = []
+                name_tensors = []
+                for n, p in zip(names, params):
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
+                            name_tensors.append(n)
                         else:
                             param_dtensors.append(p)
+                            name_dtensors.append(n)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
+                        name_tensors.append(n)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
                         )
                     self.parallel(
+                        name_dtensors,
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
                 if len(param_tensors) > 0:
                     self.base(
+                        name_tensors,
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
             else:

build/torch27-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_4043ece_dirty
-ops = torch.ops._optimizer_4043ece_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_4043ece_dirty::{op_name}"

 import torch
+from . import _optimizer_9c21645_dirty
+ops = torch.ops._optimizer_9c21645_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_9c21645_dirty::{op_name}"

build/{torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_4043ece_dirty.abi3.so → torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5cd3d459f72674bcd05ba7cb96111bc90b08eeda3cbe1cd81ec5c0cd11730990
 size 1883344

 version https://git-lfs.github.com/spec/v1
+oid sha256:dae71b7e998e72130093a86f8c983c3379510e23525e3cdcd4afe5c21bf4d3db
 size 1883344

build/torch27-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import math
 import types
 from dataclasses import dataclass
-from typing import Optional, Union, cast
 import torch
 import torch.distributed as dist
@@ -66,6 +66,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
 @torch.no_grad()
@@ -193,32 +194,93 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
         state.scattered_u = None
         u_dtensor = None
 def default_is_muon(name, x):
-    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
     return [
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            True
         },
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (not is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            False
         },
     ]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -246,21 +308,38 @@ class Muon(torch.optim.Optimizer):
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
     """
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        momentum=0.95,
-        nesterov=True,
-        ns_steps=5,
-        weight_decay=0.1,
-        adamw_betas=(0.9, 0.95),
-        adamw_eps=1e-8,
-        none_grad=True,
-        debug=False,
-    ):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -292,6 +371,7 @@ class Muon(torch.optim.Optimizer):
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
@@ -327,7 +407,7 @@ class Muon(torch.optim.Optimizer):
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
-    def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -346,15 +426,21 @@ class Muon(torch.optim.Optimizer):
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
-        ordered_params = sorted(params,
-                                key=lambda p: param_to_flops[id(p)],
-                                reverse=True)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
-        for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
@@ -364,14 +450,16 @@ class Muon(torch.optim.Optimizer):
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
-    def base(self, params, group, lr, weight_decay, momentum):
         # generate weight updates in distributed fashion
-        for p in params:
             g = p.grad
             if g is None:
                 continue
@@ -396,6 +484,12 @@ class Muon(torch.optim.Optimizer):
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
@@ -416,7 +510,58 @@ class Muon(torch.optim.Optimizer):
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
-    def parallel(self, params, group, lr, weight_decay, momentum):
         """
         Perform a parallel optimization step using Muon.
         """
@@ -438,7 +583,7 @@ class Muon(torch.optim.Optimizer):
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -553,12 +698,16 @@ class Muon(torch.optim.Optimizer):
                 maximize=maximize,
             )
-    def step(self, closure=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
         loss = None
         if closure is not None:
@@ -575,11 +724,14 @@ class Muon(torch.optim.Optimizer):
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
                 param_dtensors = []
                 param_tensors = []
-                for p in params:
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
@@ -587,10 +739,13 @@ class Muon(torch.optim.Optimizer):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
                         else:
                             param_dtensors.append(p)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
@@ -608,20 +763,24 @@ class Muon(torch.optim.Optimizer):
                         )
                     self.parallel(
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
                 if len(param_tensors) > 0:
                     self.base(
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
             else:

 import math
 import types
 from dataclasses import dataclass
+from typing import List, Optional, Union, cast
 import torch
 import torch.distributed as dist
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
+    qk_clip_state = None
 @torch.no_grad()
         state.scattered_u = None
         u_dtensor = None
+        scales_full = Muon._compute_scales(p, state.qk_clip_state)
+        if scales_full is not None:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            local_rank = dist.get_rank(group=state.process_group)
+            scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
+            scales_local = DTensor.from_local(
+                scales_local,
+                placements=p.placements,
+                device_mesh=p.device_mesh,
+            )
+            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
 def default_is_muon(name, x):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    return x.ndim >= 2 and not any(key in name for key in skip_keys)
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    muon_params, muon_names = [], []
+    non_muon_params = []
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
     return [
         {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
         },
         {
+            "params": non_muon_params,
+            "use_muon": False,
         },
     ]
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+    Example:
+        'model.3.attn.wq.weight'      -> ('wq', 3)
+        'model.5.attn.wk.weight'      -> ('wk', 5)
+        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight'  -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+    kind = parts[-2]
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+    return None, -1
+@dataclass
+class QKClipInfo:
+    """Per-parameter dynamic info computed from config + runtime logits."""
+    kind: Optional[str]  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
+    indices: List[int]  # which heads to consider for clipping
+    head_dim: int  # from config
+    threshold: float  # from config
+    logit: Optional[torch.Tensor]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
+        clip_config: Configuration for QK clipping. Expected keys:
+            - "q_indices" (list[int]): Indices of query heads to consider.
+            - "k_indices" (list[int]): Indices of key heads to consider.
+            - "head_dim" (int): Dimensionality of each attention head.
+            - "threshold" (float): Threshold value; heads whose QK logits exceed
+            this value will be scaled down.
+            Default is:
+                {
+                    "q_indices": [],
+                    "k_indices": [],
+                    "head_dim": 128,
+                    "threshold": 100
+                }
     """
+    def __init__(self,
+                 params,
+                 lr=1e-3,
+                 momentum=0.95,
+                 nesterov=True,
+                 ns_steps=5,
+                 weight_decay=0.1,
+                 adamw_betas=(0.9, 0.95),
+                 adamw_eps=1e-8,
+                 none_grad=True,
+                 debug=False,
+                 clip_config={
+                     "q_indices": [],
+                     "k_indices": [],
+                     "head_dim": 128,
+                     "threshold": 100
+                 }):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
+        self.clip_config = clip_config
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
+    def init_state_and_assign_params(self, names, params, group, qk_logits):
         param_to_state = {}
         param_to_flops = {}
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
+        paired = list(zip(names, params))
+        paired_sorted = sorted(paired,
+                               key=lambda x: param_to_flops[id(x[1])],
+                               reverse=True)
+        names_sorted, params_sorted = zip(*paired_sorted)
+        ordered_names = list(names_sorted)
+        ordered_params = list(params_sorted)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
+        for n, p in zip(ordered_names, ordered_params):
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            param_to_state[id(p)].qk_clip_state = qk_clip_state
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
+    def base(self, names, params, group, lr, weight_decay, momentum,
+             qk_logits):
         # generate weight updates in distributed fashion
+        for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            scales_full = self._compute_scales(p, qk_clip_state)
+            if scales_full is not None:
+                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
+    def get_qk_clip_info(self, n, qk_logits):
+        head_dim = self.clip_config.get('head_dim')
+        threshold = self.clip_config.get('threshold')
+        kind, layer_idx = parse_qk_layer(n)
+        logit, indices = None, []
+        if qk_logits is not None and kind is not None:
+            logit = qk_logits[layer_idx]
+            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+            indices = self.clip_config.get(indices_key, []) or []
+        return QKClipInfo(
+            kind=kind,
+            indices=indices,
+            head_dim=head_dim,
+            threshold=threshold,
+            logit=logit,
+        )
+    @staticmethod
+    def _compute_scales(p, qk_clip_state):
+        kind = qk_clip_state.kind
+        indices = qk_clip_state.indices
+        head_dim = qk_clip_state.head_dim
+        threshold = qk_clip_state.threshold
+        logit = qk_clip_state.logit
+        H_global = p.shape[0] // head_dim
+        scales_full = torch.ones(H_global, device=p.data.device)
+        scaling = 0
+        for logit_idx, head_idx in enumerate(indices):
+            v_ele = float(logit[logit_idx])
+            if v_ele > threshold:
+                new_scale = math.sqrt(threshold / v_ele)
+                if new_scale < scales_full[head_idx]:
+                    scales_full[head_idx] = new_scale
+                    logger.info(
+                        f"[{kind}] Head {head_idx} exceeded threshold "
+                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    )
+                    scaling += 1
+        return scales_full if scaling > 0 else None
+    @staticmethod
+    def _qk_clip(p, scales, head_dim):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+    def parallel(self, names, params, group, lr, weight_decay, momentum,
+                 qk_logits):
         """
         Perform a parallel optimization step using Muon.
         """
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
+            names, params, group, qk_logits)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 maximize=maximize,
             )
+    def step(self, closure=None, qk_logits=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
+            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                to 1D tensors of shape (num_heads,), representing the maximum
+                QK logits across all tokens, computed as
+                (1 / sqrt(head_dim)) * (Q @ K^T).
         """
         loss = None
         if closure is not None:
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
+                names = group["names"]
                 param_dtensors = []
                 param_tensors = []
+                name_dtensors = []
+                name_tensors = []
+                for n, p in zip(names, params):
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
+                            name_tensors.append(n)
                         else:
                             param_dtensors.append(p)
+                            name_dtensors.append(n)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
+                        name_tensors.append(n)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
                         )
                     self.parallel(
+                        name_dtensors,
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
                 if len(param_tensors) > 0:
                     self.base(
+                        name_tensors,
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
             else:

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_4043ece_dirty
-ops = torch.ops._optimizer_4043ece_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_4043ece_dirty::{op_name}"

 import torch
+from . import _optimizer_9c21645_dirty
+ops = torch.ops._optimizer_9c21645_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_9c21645_dirty::{op_name}"

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_4043ece_dirty.abi3.so → _optimizer_9c21645_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:36782463aaeaa8b35d9770743fe068b907085876d957c9d830d468fff4ebc735
 size 1749776

 version https://git-lfs.github.com/spec/v1
+oid sha256:41492cb1479920b654768a5597d88670dd0caeedbdcd73fd63afa31ffc6961d6
 size 1749776

build/torch27-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import math
 import types
 from dataclasses import dataclass
-from typing import Optional, Union, cast
 import torch
 import torch.distributed as dist
@@ -66,6 +66,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
 @torch.no_grad()
@@ -193,32 +194,93 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
         state.scattered_u = None
         u_dtensor = None
 def default_is_muon(name, x):
-    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
     return [
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            True
         },
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (not is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            False
         },
     ]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -246,21 +308,38 @@ class Muon(torch.optim.Optimizer):
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
     """
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        momentum=0.95,
-        nesterov=True,
-        ns_steps=5,
-        weight_decay=0.1,
-        adamw_betas=(0.9, 0.95),
-        adamw_eps=1e-8,
-        none_grad=True,
-        debug=False,
-    ):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -292,6 +371,7 @@ class Muon(torch.optim.Optimizer):
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
@@ -327,7 +407,7 @@ class Muon(torch.optim.Optimizer):
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
-    def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -346,15 +426,21 @@ class Muon(torch.optim.Optimizer):
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
-        ordered_params = sorted(params,
-                                key=lambda p: param_to_flops[id(p)],
-                                reverse=True)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
-        for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
@@ -364,14 +450,16 @@ class Muon(torch.optim.Optimizer):
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
-    def base(self, params, group, lr, weight_decay, momentum):
         # generate weight updates in distributed fashion
-        for p in params:
             g = p.grad
             if g is None:
                 continue
@@ -396,6 +484,12 @@ class Muon(torch.optim.Optimizer):
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
@@ -416,7 +510,58 @@ class Muon(torch.optim.Optimizer):
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
-    def parallel(self, params, group, lr, weight_decay, momentum):
         """
         Perform a parallel optimization step using Muon.
         """
@@ -438,7 +583,7 @@ class Muon(torch.optim.Optimizer):
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -553,12 +698,16 @@ class Muon(torch.optim.Optimizer):
                 maximize=maximize,
             )
-    def step(self, closure=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
         loss = None
         if closure is not None:
@@ -575,11 +724,14 @@ class Muon(torch.optim.Optimizer):
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
                 param_dtensors = []
                 param_tensors = []
-                for p in params:
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
@@ -587,10 +739,13 @@ class Muon(torch.optim.Optimizer):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
                         else:
                             param_dtensors.append(p)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
@@ -608,20 +763,24 @@ class Muon(torch.optim.Optimizer):
                         )
                     self.parallel(
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
                 if len(param_tensors) > 0:
                     self.base(
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
             else:

 import math
 import types
 from dataclasses import dataclass
+from typing import List, Optional, Union, cast
 import torch
 import torch.distributed as dist
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
+    qk_clip_state = None
 @torch.no_grad()
         state.scattered_u = None
         u_dtensor = None
+        scales_full = Muon._compute_scales(p, state.qk_clip_state)
+        if scales_full is not None:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            local_rank = dist.get_rank(group=state.process_group)
+            scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
+            scales_local = DTensor.from_local(
+                scales_local,
+                placements=p.placements,
+                device_mesh=p.device_mesh,
+            )
+            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
 def default_is_muon(name, x):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    return x.ndim >= 2 and not any(key in name for key in skip_keys)
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    muon_params, muon_names = [], []
+    non_muon_params = []
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
     return [
         {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
         },
         {
+            "params": non_muon_params,
+            "use_muon": False,
         },
     ]
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+    Example:
+        'model.3.attn.wq.weight'      -> ('wq', 3)
+        'model.5.attn.wk.weight'      -> ('wk', 5)
+        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight'  -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+    kind = parts[-2]
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+    return None, -1
+@dataclass
+class QKClipInfo:
+    """Per-parameter dynamic info computed from config + runtime logits."""
+    kind: Optional[str]  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
+    indices: List[int]  # which heads to consider for clipping
+    head_dim: int  # from config
+    threshold: float  # from config
+    logit: Optional[torch.Tensor]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
+        clip_config: Configuration for QK clipping. Expected keys:
+            - "q_indices" (list[int]): Indices of query heads to consider.
+            - "k_indices" (list[int]): Indices of key heads to consider.
+            - "head_dim" (int): Dimensionality of each attention head.
+            - "threshold" (float): Threshold value; heads whose QK logits exceed
+            this value will be scaled down.
+            Default is:
+                {
+                    "q_indices": [],
+                    "k_indices": [],
+                    "head_dim": 128,
+                    "threshold": 100
+                }
     """
+    def __init__(self,
+                 params,
+                 lr=1e-3,
+                 momentum=0.95,
+                 nesterov=True,
+                 ns_steps=5,
+                 weight_decay=0.1,
+                 adamw_betas=(0.9, 0.95),
+                 adamw_eps=1e-8,
+                 none_grad=True,
+                 debug=False,
+                 clip_config={
+                     "q_indices": [],
+                     "k_indices": [],
+                     "head_dim": 128,
+                     "threshold": 100
+                 }):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
+        self.clip_config = clip_config
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
+    def init_state_and_assign_params(self, names, params, group, qk_logits):
         param_to_state = {}
         param_to_flops = {}
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
+        paired = list(zip(names, params))
+        paired_sorted = sorted(paired,
+                               key=lambda x: param_to_flops[id(x[1])],
+                               reverse=True)
+        names_sorted, params_sorted = zip(*paired_sorted)
+        ordered_names = list(names_sorted)
+        ordered_params = list(params_sorted)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
+        for n, p in zip(ordered_names, ordered_params):
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            param_to_state[id(p)].qk_clip_state = qk_clip_state
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
+    def base(self, names, params, group, lr, weight_decay, momentum,
+             qk_logits):
         # generate weight updates in distributed fashion
+        for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            scales_full = self._compute_scales(p, qk_clip_state)
+            if scales_full is not None:
+                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
+    def get_qk_clip_info(self, n, qk_logits):
+        head_dim = self.clip_config.get('head_dim')
+        threshold = self.clip_config.get('threshold')
+        kind, layer_idx = parse_qk_layer(n)
+        logit, indices = None, []
+        if qk_logits is not None and kind is not None:
+            logit = qk_logits[layer_idx]
+            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+            indices = self.clip_config.get(indices_key, []) or []
+        return QKClipInfo(
+            kind=kind,
+            indices=indices,
+            head_dim=head_dim,
+            threshold=threshold,
+            logit=logit,
+        )
+    @staticmethod
+    def _compute_scales(p, qk_clip_state):
+        kind = qk_clip_state.kind
+        indices = qk_clip_state.indices
+        head_dim = qk_clip_state.head_dim
+        threshold = qk_clip_state.threshold
+        logit = qk_clip_state.logit
+        H_global = p.shape[0] // head_dim
+        scales_full = torch.ones(H_global, device=p.data.device)
+        scaling = 0
+        for logit_idx, head_idx in enumerate(indices):
+            v_ele = float(logit[logit_idx])
+            if v_ele > threshold:
+                new_scale = math.sqrt(threshold / v_ele)
+                if new_scale < scales_full[head_idx]:
+                    scales_full[head_idx] = new_scale
+                    logger.info(
+                        f"[{kind}] Head {head_idx} exceeded threshold "
+                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    )
+                    scaling += 1
+        return scales_full if scaling > 0 else None
+    @staticmethod
+    def _qk_clip(p, scales, head_dim):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+    def parallel(self, names, params, group, lr, weight_decay, momentum,
+                 qk_logits):
         """
         Perform a parallel optimization step using Muon.
         """
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
+            names, params, group, qk_logits)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 maximize=maximize,
             )
+    def step(self, closure=None, qk_logits=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
+            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                to 1D tensors of shape (num_heads,), representing the maximum
+                QK logits across all tokens, computed as
+                (1 / sqrt(head_dim)) * (Q @ K^T).
         """
         loss = None
         if closure is not None:
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
+                names = group["names"]
                 param_dtensors = []
                 param_tensors = []
+                name_dtensors = []
+                name_tensors = []
+                for n, p in zip(names, params):
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
+                            name_tensors.append(n)
                         else:
                             param_dtensors.append(p)
+                            name_dtensors.append(n)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
+                        name_tensors.append(n)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
                         )
                     self.parallel(
+                        name_dtensors,
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
                 if len(param_tensors) > 0:
                     self.base(
+                        name_tensors,
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
             else:

build/torch28-cxx11-cu126-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_4043ece_dirty
-ops = torch.ops._optimizer_4043ece_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_4043ece_dirty::{op_name}"

 import torch
+from . import _optimizer_9c21645_dirty
+ops = torch.ops._optimizer_9c21645_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_9c21645_dirty::{op_name}"

build/{torch27-cxx11-cu126-x86_64-linux/optimizer/_optimizer_4043ece_dirty.abi3.so → torch28-cxx11-cu126-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:282b0d443dc7b9c82703e5fd0f1a0faea94370934a92bef5042bf53ac3cae39c
 size 1824256

 version https://git-lfs.github.com/spec/v1
+oid sha256:42ae6ac1cf967d7d23cac7930c8db635105f60631220a60b9cee060d082f40ae
 size 1824256

build/torch28-cxx11-cu126-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import math
 import types
 from dataclasses import dataclass
-from typing import Optional, Union, cast
 import torch
 import torch.distributed as dist
@@ -66,6 +66,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
 @torch.no_grad()
@@ -193,32 +194,93 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
         state.scattered_u = None
         u_dtensor = None
 def default_is_muon(name, x):
-    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
     return [
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            True
         },
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (not is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            False
         },
     ]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -246,21 +308,38 @@ class Muon(torch.optim.Optimizer):
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
     """
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        momentum=0.95,
-        nesterov=True,
-        ns_steps=5,
-        weight_decay=0.1,
-        adamw_betas=(0.9, 0.95),
-        adamw_eps=1e-8,
-        none_grad=True,
-        debug=False,
-    ):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -292,6 +371,7 @@ class Muon(torch.optim.Optimizer):
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
@@ -327,7 +407,7 @@ class Muon(torch.optim.Optimizer):
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
-    def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -346,15 +426,21 @@ class Muon(torch.optim.Optimizer):
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
-        ordered_params = sorted(params,
-                                key=lambda p: param_to_flops[id(p)],
-                                reverse=True)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
-        for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
@@ -364,14 +450,16 @@ class Muon(torch.optim.Optimizer):
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
-    def base(self, params, group, lr, weight_decay, momentum):
         # generate weight updates in distributed fashion
-        for p in params:
             g = p.grad
             if g is None:
                 continue
@@ -396,6 +484,12 @@ class Muon(torch.optim.Optimizer):
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
@@ -416,7 +510,58 @@ class Muon(torch.optim.Optimizer):
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
-    def parallel(self, params, group, lr, weight_decay, momentum):
         """
         Perform a parallel optimization step using Muon.
         """
@@ -438,7 +583,7 @@ class Muon(torch.optim.Optimizer):
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -553,12 +698,16 @@ class Muon(torch.optim.Optimizer):
                 maximize=maximize,
             )
-    def step(self, closure=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
         loss = None
         if closure is not None:
@@ -575,11 +724,14 @@ class Muon(torch.optim.Optimizer):
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
                 param_dtensors = []
                 param_tensors = []
-                for p in params:
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
@@ -587,10 +739,13 @@ class Muon(torch.optim.Optimizer):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
                         else:
                             param_dtensors.append(p)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
@@ -608,20 +763,24 @@ class Muon(torch.optim.Optimizer):
                         )
                     self.parallel(
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
                 if len(param_tensors) > 0:
                     self.base(
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
             else:

 import math
 import types
 from dataclasses import dataclass
+from typing import List, Optional, Union, cast
 import torch
 import torch.distributed as dist
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
+    qk_clip_state = None
 @torch.no_grad()
         state.scattered_u = None
         u_dtensor = None
+        scales_full = Muon._compute_scales(p, state.qk_clip_state)
+        if scales_full is not None:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            local_rank = dist.get_rank(group=state.process_group)
+            scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
+            scales_local = DTensor.from_local(
+                scales_local,
+                placements=p.placements,
+                device_mesh=p.device_mesh,
+            )
+            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
 def default_is_muon(name, x):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    return x.ndim >= 2 and not any(key in name for key in skip_keys)
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    muon_params, muon_names = [], []
+    non_muon_params = []
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
     return [
         {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
         },
         {
+            "params": non_muon_params,
+            "use_muon": False,
         },
     ]
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+    Example:
+        'model.3.attn.wq.weight'      -> ('wq', 3)
+        'model.5.attn.wk.weight'      -> ('wk', 5)
+        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight'  -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+    kind = parts[-2]
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+    return None, -1
@dataclass
class QKClipInfo:
    """Bundle of QK-clip settings and runtime logits for one parameter.

    Combines the static clip configuration (head size, threshold, head
    indices) with the logits supplied for the current step.
    """
    # Matched projection name ('wq'/'q_proj' or 'wk'/'k_proj'); None when
    # the parameter is not a query/key projection.
    kind: Optional[str]
    # Heads to consider for clipping.
    indices: List[int]
    # Rows per attention head, taken from the clip configuration.
    head_dim: int
    # Logit value above which a head is scaled down, from the configuration.
    threshold: float
    # Logits for this parameter's layer; None when none were provided.
    logit: Optional[torch.Tensor]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
+        clip_config: Configuration for QK clipping. Expected keys:
+            - "q_indices" (list[int]): Indices of query heads to consider.
+            - "k_indices" (list[int]): Indices of key heads to consider.
+            - "head_dim" (int): Dimensionality of each attention head.
+            - "threshold" (float): Threshold value; heads whose QK logits exceed
+            this value will be scaled down.
+            Default is:
+                {
+                    "q_indices": [],
+                    "k_indices": [],
+                    "head_dim": 128,
+                    "threshold": 100
+                }
     """
+    def __init__(self,
+                 params,
+                 lr=1e-3,
+                 momentum=0.95,
+                 nesterov=True,
+                 ns_steps=5,
+                 weight_decay=0.1,
+                 adamw_betas=(0.9, 0.95),
+                 adamw_eps=1e-8,
+                 none_grad=True,
+                 debug=False,
+                 clip_config={
+                     "q_indices": [],
+                     "k_indices": [],
+                     "head_dim": 128,
+                     "threshold": 100
+                 }):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
+        self.clip_config = clip_config
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
+    def init_state_and_assign_params(self, names, params, group, qk_logits):
         param_to_state = {}
         param_to_flops = {}
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
+        paired = list(zip(names, params))
+        paired_sorted = sorted(paired,
+                               key=lambda x: param_to_flops[id(x[1])],
+                               reverse=True)
+        names_sorted, params_sorted = zip(*paired_sorted)
+        ordered_names = list(names_sorted)
+        ordered_params = list(params_sorted)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
+        for n, p in zip(ordered_names, ordered_params):
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            param_to_state[id(p)].qk_clip_state = qk_clip_state
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
+    def base(self, names, params, group, lr, weight_decay, momentum,
+             qk_logits):
         # generate weight updates in distributed fashion
+        for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            scales_full = self._compute_scales(p, qk_clip_state)
+            if scales_full is not None:
+                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
+    def get_qk_clip_info(self, n, qk_logits):
+        head_dim = self.clip_config.get('head_dim')
+        threshold = self.clip_config.get('threshold')
+        kind, layer_idx = parse_qk_layer(n)
+        logit, indices = None, []
+        if qk_logits is not None and kind is not None:
+            logit = qk_logits[layer_idx]
+            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+            indices = self.clip_config.get(indices_key, []) or []
+        return QKClipInfo(
+            kind=kind,
+            indices=indices,
+            head_dim=head_dim,
+            threshold=threshold,
+            logit=logit,
+        )
+    @staticmethod
+    def _compute_scales(p, qk_clip_state):
+        kind = qk_clip_state.kind
+        indices = qk_clip_state.indices
+        head_dim = qk_clip_state.head_dim
+        threshold = qk_clip_state.threshold
+        logit = qk_clip_state.logit
+        H_global = p.shape[0] // head_dim
+        scales_full = torch.ones(H_global, device=p.data.device)
+        scaling = 0
+        for logit_idx, head_idx in enumerate(indices):
+            v_ele = float(logit[logit_idx])
+            if v_ele > threshold:
+                new_scale = math.sqrt(threshold / v_ele)
+                if new_scale < scales_full[head_idx]:
+                    scales_full[head_idx] = new_scale
+                    logger.info(
+                        f"[{kind}] Head {head_idx} exceeded threshold "
+                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    )
+                    scaling += 1
+        return scales_full if scaling > 0 else None
+    @staticmethod
+    def _qk_clip(p, scales, head_dim):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+    def parallel(self, names, params, group, lr, weight_decay, momentum,
+                 qk_logits):
         """
         Perform a parallel optimization step using Muon.
         """
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
+            names, params, group, qk_logits)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 maximize=maximize,
             )
+    def step(self, closure=None, qk_logits=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
+            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                to 1D tensors of shape (num_heads,), representing the maximum
+                QK logits across all tokens, computed as
+                (1 / sqrt(head_dim)) * (Q @ K^T).
         """
         loss = None
         if closure is not None:
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
+                names = group["names"]
                 param_dtensors = []
                 param_tensors = []
+                name_dtensors = []
+                name_tensors = []
+                for n, p in zip(names, params):
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
+                            name_tensors.append(n)
                         else:
                             param_dtensors.append(p)
+                            name_dtensors.append(n)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
+                        name_tensors.append(n)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
                         )
                     self.parallel(
+                        name_dtensors,
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
                 if len(param_tensors) > 0:
                     self.base(
+                        name_tensors,
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
             else:

build/torch28-cxx11-cu128-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_4043ece_dirty
-ops = torch.ops._optimizer_4043ece_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_4043ece_dirty::{op_name}"

 import torch
+from . import _optimizer_9c21645_dirty
+ops = torch.ops._optimizer_9c21645_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_9c21645_dirty::{op_name}"

build/{torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_4043ece_dirty.abi3.so → torch28-cxx11-cu128-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dff83fb4e6107a9447ae36fa98c19a873d71525898fde676c51252396c02a633
 size 1883344

 version https://git-lfs.github.com/spec/v1
+oid sha256:dae71b7e998e72130093a86f8c983c3379510e23525e3cdcd4afe5c21bf4d3db
 size 1883344

build/torch28-cxx11-cu128-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import math
 import types
 from dataclasses import dataclass
-from typing import Optional, Union, cast
 import torch
 import torch.distributed as dist
@@ -66,6 +66,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
 @torch.no_grad()
@@ -193,32 +194,93 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
         state.scattered_u = None
         u_dtensor = None
 def default_is_muon(name, x):
-    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
     return [
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            True
         },
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (not is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            False
         },
     ]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -246,21 +308,38 @@ class Muon(torch.optim.Optimizer):
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
     """
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        momentum=0.95,
-        nesterov=True,
-        ns_steps=5,
-        weight_decay=0.1,
-        adamw_betas=(0.9, 0.95),
-        adamw_eps=1e-8,
-        none_grad=True,
-        debug=False,
-    ):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -292,6 +371,7 @@ class Muon(torch.optim.Optimizer):
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
@@ -327,7 +407,7 @@ class Muon(torch.optim.Optimizer):
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
-    def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -346,15 +426,21 @@ class Muon(torch.optim.Optimizer):
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
-        ordered_params = sorted(params,
-                                key=lambda p: param_to_flops[id(p)],
-                                reverse=True)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
-        for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
@@ -364,14 +450,16 @@ class Muon(torch.optim.Optimizer):
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
-    def base(self, params, group, lr, weight_decay, momentum):
         # generate weight updates in distributed fashion
-        for p in params:
             g = p.grad
             if g is None:
                 continue
@@ -396,6 +484,12 @@ class Muon(torch.optim.Optimizer):
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
@@ -416,7 +510,58 @@ class Muon(torch.optim.Optimizer):
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
-    def parallel(self, params, group, lr, weight_decay, momentum):
         """
         Perform a parallel optimization step using Muon.
         """
@@ -438,7 +583,7 @@ class Muon(torch.optim.Optimizer):
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -553,12 +698,16 @@ class Muon(torch.optim.Optimizer):
                 maximize=maximize,
             )
-    def step(self, closure=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
         loss = None
         if closure is not None:
@@ -575,11 +724,14 @@ class Muon(torch.optim.Optimizer):
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
                 param_dtensors = []
                 param_tensors = []
-                for p in params:
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
@@ -587,10 +739,13 @@ class Muon(torch.optim.Optimizer):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
                         else:
                             param_dtensors.append(p)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
@@ -608,20 +763,24 @@ class Muon(torch.optim.Optimizer):
                         )
                     self.parallel(
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
                 if len(param_tensors) > 0:
                     self.base(
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
             else:

 import math
 import types
 from dataclasses import dataclass
+from typing import List, Optional, Union, cast
 import torch
 import torch.distributed as dist
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
+    qk_clip_state = None
 @torch.no_grad()
         state.scattered_u = None
         u_dtensor = None
+        scales_full = Muon._compute_scales(p, state.qk_clip_state)
+        if scales_full is not None:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            local_rank = dist.get_rank(group=state.process_group)
+            scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
+            scales_local = DTensor.from_local(
+                scales_local,
+                placements=p.placements,
+                device_mesh=p.device_mesh,
+            )
+            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
 def default_is_muon(name, x):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    return x.ndim >= 2 and not any(key in name for key in skip_keys)
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    muon_params, muon_names = [], []
+    non_muon_params = []
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
     return [
         {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
         },
         {
+            "params": non_muon_params,
+            "use_muon": False,
         },
     ]
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+    Example:
+        'model.3.attn.wq.weight'      -> ('wq', 3)
+        'model.5.attn.wk.weight'      -> ('wk', 5)
+        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight'  -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+    kind = parts[-2]
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+    return None, -1
@dataclass
class QKClipInfo:
    """Bundle of QK-clip settings and runtime logits for one parameter.

    Combines the static clip configuration (head size, threshold, head
    indices) with the logits supplied for the current step.
    """
    # Matched projection name ('wq'/'q_proj' or 'wk'/'k_proj'); None when
    # the parameter is not a query/key projection.
    kind: Optional[str]
    # Heads to consider for clipping.
    indices: List[int]
    # Rows per attention head, taken from the clip configuration.
    head_dim: int
    # Logit value above which a head is scaled down, from the configuration.
    threshold: float
    # Logits for this parameter's layer; None when none were provided.
    logit: Optional[torch.Tensor]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
+        clip_config: Configuration for QK clipping. Expected keys:
+            - "q_indices" (list[int]): Indices of query heads to consider.
+            - "k_indices" (list[int]): Indices of key heads to consider.
+            - "head_dim" (int): Dimensionality of each attention head.
+            - "threshold" (float): Threshold value; heads whose QK logits exceed
+            this value will be scaled down.
+            Default is:
+                {
+                    "q_indices": [],
+                    "k_indices": [],
+                    "head_dim": 128,
+                    "threshold": 100
+                }
     """
+    def __init__(self,
+                 params,
+                 lr=1e-3,
+                 momentum=0.95,
+                 nesterov=True,
+                 ns_steps=5,
+                 weight_decay=0.1,
+                 adamw_betas=(0.9, 0.95),
+                 adamw_eps=1e-8,
+                 none_grad=True,
+                 debug=False,
+                 clip_config={
+                     "q_indices": [],
+                     "k_indices": [],
+                     "head_dim": 128,
+                     "threshold": 100
+                 }):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
+        self.clip_config = clip_config
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
+    def init_state_and_assign_params(self, names, params, group, qk_logits):
         param_to_state = {}
         param_to_flops = {}
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
+        paired = list(zip(names, params))
+        paired_sorted = sorted(paired,
+                               key=lambda x: param_to_flops[id(x[1])],
+                               reverse=True)
+        names_sorted, params_sorted = zip(*paired_sorted)
+        ordered_names = list(names_sorted)
+        ordered_params = list(params_sorted)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
+        for n, p in zip(ordered_names, ordered_params):
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            param_to_state[id(p)].qk_clip_state = qk_clip_state
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
+    def base(self, names, params, group, lr, weight_decay, momentum,
+             qk_logits):
         # generate weight updates in distributed fashion
+        for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            scales_full = self._compute_scales(p, qk_clip_state)
+            if scales_full is not None:
+                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
+    def get_qk_clip_info(self, n, qk_logits):
+        head_dim = self.clip_config.get('head_dim')
+        threshold = self.clip_config.get('threshold')
+        kind, layer_idx = parse_qk_layer(n)
+        logit, indices = None, []
+        if qk_logits is not None and kind is not None:
+            logit = qk_logits[layer_idx]
+            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+            indices = self.clip_config.get(indices_key, []) or []
+        return QKClipInfo(
+            kind=kind,
+            indices=indices,
+            head_dim=head_dim,
+            threshold=threshold,
+            logit=logit,
+        )
+    @staticmethod
+    def _compute_scales(p, qk_clip_state):
+        kind = qk_clip_state.kind
+        indices = qk_clip_state.indices
+        head_dim = qk_clip_state.head_dim
+        threshold = qk_clip_state.threshold
+        logit = qk_clip_state.logit
+        H_global = p.shape[0] // head_dim
+        scales_full = torch.ones(H_global, device=p.data.device)
+        scaling = 0
+        for logit_idx, head_idx in enumerate(indices):
+            v_ele = float(logit[logit_idx])
+            if v_ele > threshold:
+                new_scale = math.sqrt(threshold / v_ele)
+                if new_scale < scales_full[head_idx]:
+                    scales_full[head_idx] = new_scale
+                    logger.info(
+                        f"[{kind}] Head {head_idx} exceeded threshold "
+                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    )
+                    scaling += 1
+        return scales_full if scaling > 0 else None
+    @staticmethod
+    def _qk_clip(p, scales, head_dim):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+    def parallel(self, names, params, group, lr, weight_decay, momentum,
+                 qk_logits):
         """
         Perform a parallel optimization step using Muon.
         """
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
+            names, params, group, qk_logits)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 maximize=maximize,
             )
+    def step(self, closure=None, qk_logits=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
+            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                to 1D tensors of shape (num_heads,), representing the maximum
+                QK logits across all tokens, computed as
+                (1 / sqrt(head_dim)) * (Q @ K^T).
         """
         loss = None
         if closure is not None:
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
+                names = group["names"]
                 param_dtensors = []
                 param_tensors = []
+                name_dtensors = []
+                name_tensors = []
+                for n, p in zip(names, params):
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
+                            name_tensors.append(n)
                         else:
                             param_dtensors.append(p)
+                            name_dtensors.append(n)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
+                        name_tensors.append(n)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
                         )
                     self.parallel(
+                        name_dtensors,
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
                 if len(param_tensors) > 0:
                     self.base(
+                        name_tensors,
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
             else:

build/torch28-cxx11-cu129-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_4043ece_dirty
-ops = torch.ops._optimizer_4043ece_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_4043ece_dirty::{op_name}"

 import torch
+from . import _optimizer_9c21645_dirty
+ops = torch.ops._optimizer_9c21645_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_9c21645_dirty::{op_name}"

build/{torch27-cxx11-cu128-x86_64-linux/optimizer/_optimizer_4043ece_dirty.abi3.so → torch28-cxx11-cu129-x86_64-linux/optimizer/_optimizer_9c21645_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5cd3d459f72674bcd05ba7cb96111bc90b08eeda3cbe1cd81ec5c0cd11730990
 size 1883344

 version https://git-lfs.github.com/spec/v1
+oid sha256:eb40a06623bb3668b82ff248b5a3c1bcf41e7f3f860888b261505b3a71257bc7
 size 1883344

build/torch28-cxx11-cu129-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import math
 import types
 from dataclasses import dataclass
-from typing import Optional, Union, cast
 import torch
 import torch.distributed as dist
@@ -66,6 +66,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
 @torch.no_grad()
@@ -193,32 +194,93 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
         state.scattered_u = None
         u_dtensor = None
 def default_is_muon(name, x):
-    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
     return [
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            True
         },
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (not is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            False
         },
     ]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -246,21 +308,38 @@ class Muon(torch.optim.Optimizer):
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
     """
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        momentum=0.95,
-        nesterov=True,
-        ns_steps=5,
-        weight_decay=0.1,
-        adamw_betas=(0.9, 0.95),
-        adamw_eps=1e-8,
-        none_grad=True,
-        debug=False,
-    ):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -292,6 +371,7 @@ class Muon(torch.optim.Optimizer):
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
@@ -327,7 +407,7 @@ class Muon(torch.optim.Optimizer):
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
-    def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -346,15 +426,21 @@ class Muon(torch.optim.Optimizer):
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
-        ordered_params = sorted(params,
-                                key=lambda p: param_to_flops[id(p)],
-                                reverse=True)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
-        for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
@@ -364,14 +450,16 @@ class Muon(torch.optim.Optimizer):
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
-    def base(self, params, group, lr, weight_decay, momentum):
         # generate weight updates in distributed fashion
-        for p in params:
             g = p.grad
             if g is None:
                 continue
@@ -396,6 +484,12 @@ class Muon(torch.optim.Optimizer):
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
@@ -416,7 +510,58 @@ class Muon(torch.optim.Optimizer):
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
-    def parallel(self, params, group, lr, weight_decay, momentum):
         """
         Perform a parallel optimization step using Muon.
         """
@@ -438,7 +583,7 @@ class Muon(torch.optim.Optimizer):
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -553,12 +698,16 @@ class Muon(torch.optim.Optimizer):
                 maximize=maximize,
             )
-    def step(self, closure=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
         loss = None
         if closure is not None:
@@ -575,11 +724,14 @@ class Muon(torch.optim.Optimizer):
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
                 param_dtensors = []
                 param_tensors = []
-                for p in params:
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
@@ -587,10 +739,13 @@ class Muon(torch.optim.Optimizer):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
                         else:
                             param_dtensors.append(p)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
@@ -608,20 +763,24 @@ class Muon(torch.optim.Optimizer):
                         )
                     self.parallel(
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
                 if len(param_tensors) > 0:
                     self.base(
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
             else:

 import math
 import types
 from dataclasses import dataclass
+from typing import List, Optional, Union, cast
 import torch
 import torch.distributed as dist
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
+    qk_clip_state = None
 @torch.no_grad()
         state.scattered_u = None
         u_dtensor = None
+        scales_full = Muon._compute_scales(p, state.qk_clip_state)
+        if scales_full is not None:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            local_rank = dist.get_rank(group=state.process_group)
+            scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
+            scales_local = DTensor.from_local(
+                scales_local,
+                placements=p.placements,
+                device_mesh=p.device_mesh,
+            )
+            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
 def default_is_muon(name, x):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    return x.ndim >= 2 and not any(key in name for key in skip_keys)
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    muon_params, muon_names = [], []
+    non_muon_params = []
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
     return [
         {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
         },
         {
+            "params": non_muon_params,
+            "use_muon": False,
         },
     ]
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+    Example:
+        'model.3.attn.wq.weight'      -> ('wq', 3)
+        'model.5.attn.wk.weight'      -> ('wk', 5)
+        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight'  -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+    kind = parts[-2]
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+    return None, -1
+@dataclass
+class QKClipInfo:
+    """Per-parameter dynamic info computed from config + runtime logits."""
+    kind: Optional[str]  # 'wq'/'q_proj' or 'wk'/'k_proj' or None
+    indices: List[int]  # which heads to consider for clipping
+    head_dim: int  # from config
+    threshold: float  # from config
+    logit: Optional[torch.Tensor]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
+        clip_config: Configuration for QK clipping. Expected keys:
+            - "q_indices" (list[int]): Indices of query heads to consider.
+            - "k_indices" (list[int]): Indices of key heads to consider.
+            - "head_dim" (int): Dimensionality of each attention head.
+            - "threshold" (float): Threshold value; heads whose QK logits exceed
+            this value will be scaled down.
+            Default is:
+                {
+                    "q_indices": [],
+                    "k_indices": [],
+                    "head_dim": 128,
+                    "threshold": 100
+                }
     """
+    def __init__(self,
+                 params,
+                 lr=1e-3,
+                 momentum=0.95,
+                 nesterov=True,
+                 ns_steps=5,
+                 weight_decay=0.1,
+                 adamw_betas=(0.9, 0.95),
+                 adamw_eps=1e-8,
+                 none_grad=True,
+                 debug=False,
+                 clip_config={
+                     "q_indices": [],
+                     "k_indices": [],
+                     "head_dim": 128,
+                     "threshold": 100
+                 }):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
+        self.clip_config = clip_config
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
+    def init_state_and_assign_params(self, names, params, group, qk_logits):
         param_to_state = {}
         param_to_flops = {}
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
+        paired = list(zip(names, params))
+        paired_sorted = sorted(paired,
+                               key=lambda x: param_to_flops[id(x[1])],
+                               reverse=True)
+        names_sorted, params_sorted = zip(*paired_sorted)
+        ordered_names = list(names_sorted)
+        ordered_params = list(params_sorted)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
+        for n, p in zip(ordered_names, ordered_params):
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            param_to_state[id(p)].qk_clip_state = qk_clip_state
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
+    def base(self, names, params, group, lr, weight_decay, momentum,
+             qk_logits):
         # generate weight updates in distributed fashion
+        for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            scales_full = self._compute_scales(p, qk_clip_state)
+            if scales_full is not None:
+                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
+    def get_qk_clip_info(self, n, qk_logits):
+        head_dim = self.clip_config.get('head_dim')
+        threshold = self.clip_config.get('threshold')
+        kind, layer_idx = parse_qk_layer(n)
+        logit, indices = None, []
+        if qk_logits is not None and kind is not None:
+            logit = qk_logits[layer_idx]
+            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+            indices = self.clip_config.get(indices_key, []) or []
+        return QKClipInfo(
+            kind=kind,
+            indices=indices,
+            head_dim=head_dim,
+            threshold=threshold,
+            logit=logit,
+        )
+    @staticmethod
+    def _compute_scales(p, qk_clip_state):
+        kind = qk_clip_state.kind
+        indices = qk_clip_state.indices
+        head_dim = qk_clip_state.head_dim
+        threshold = qk_clip_state.threshold
+        logit = qk_clip_state.logit
+        H_global = p.shape[0] // head_dim
+        scales_full = torch.ones(H_global, device=p.data.device)
+        scaling = 0
+        for logit_idx, head_idx in enumerate(indices):
+            v_ele = float(logit[logit_idx])
+            if v_ele > threshold:
+                new_scale = math.sqrt(threshold / v_ele)
+                if new_scale < scales_full[head_idx]:
+                    scales_full[head_idx] = new_scale
+                    logger.info(
+                        f"[{kind}] Head {head_idx} exceeded threshold "
+                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    )
+                    scaling += 1
+        return scales_full if scaling > 0 else None
+    @staticmethod
+    def _qk_clip(p, scales, head_dim):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+    def parallel(self, names, params, group, lr, weight_decay, momentum,
+                 qk_logits):
         """
         Perform a parallel optimization step using Muon.
         """
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
+            names, params, group, qk_logits)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 maximize=maximize,
             )
+    def step(self, closure=None, qk_logits=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
+            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                to 1D tensors of shape (num_heads,), representing the maximum
+                QK logits across all tokens, computed as
+                (1 / sqrt(head_dim)) * (Q @ K^T).
         """
         loss = None
         if closure is not None:
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
+                names = group["names"]
                 param_dtensors = []
                 param_tensors = []
+                name_dtensors = []
+                name_tensors = []
+                for n, p in zip(names, params):
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
+                            name_tensors.append(n)
                         else:
                             param_dtensors.append(p)
+                            name_dtensors.append(n)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
+                        name_tensors.append(n)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
                         )
                     self.parallel(
+                        name_dtensors,
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
                 if len(param_tensors) > 0:
                     self.base(
+                        name_tensors,
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
             else:

build/torch28-cxx11-rocm63-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_4043ece_dirty
-ops = torch.ops._optimizer_4043ece_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_4043ece_dirty::{op_name}"

 import torch
+from . import _optimizer_9c21645_dirty
+ops = torch.ops._optimizer_9c21645_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_9c21645_dirty::{op_name}"

build/torch28-cxx11-rocm63-x86_64-linux/optimizer/{_optimizer_4043ece_dirty.abi3.so → _optimizer_9c21645_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:752f4346d75c9ede747a6baf4102022bc4bd776db86b5dbd74e47c2a112547ea
 size 1749936

 version https://git-lfs.github.com/spec/v1
+oid sha256:d8f845b8df6426eb5db57e4525b8dd3c80004c44759b01a3e39cc37a817813b5
 size 1749936

build/torch28-cxx11-rocm63-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import math
 import types
 from dataclasses import dataclass
-from typing import Optional, Union, cast
 import torch
 import torch.distributed as dist
@@ -66,6 +66,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
 @torch.no_grad()
@@ -193,32 +194,93 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
         state.scattered_u = None
         u_dtensor = None
 def default_is_muon(name, x):
-    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
     return [
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            True
         },
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (not is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            False
         },
     ]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -246,21 +308,38 @@ class Muon(torch.optim.Optimizer):
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
     """
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        momentum=0.95,
-        nesterov=True,
-        ns_steps=5,
-        weight_decay=0.1,
-        adamw_betas=(0.9, 0.95),
-        adamw_eps=1e-8,
-        none_grad=True,
-        debug=False,
-    ):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -292,6 +371,7 @@ class Muon(torch.optim.Optimizer):
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
@@ -327,7 +407,7 @@ class Muon(torch.optim.Optimizer):
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
-    def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -346,15 +426,21 @@ class Muon(torch.optim.Optimizer):
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
-        ordered_params = sorted(params,
-                                key=lambda p: param_to_flops[id(p)],
-                                reverse=True)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
-        for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
@@ -364,14 +450,16 @@ class Muon(torch.optim.Optimizer):
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
-    def base(self, params, group, lr, weight_decay, momentum):
         # generate weight updates in distributed fashion
-        for p in params:
             g = p.grad
             if g is None:
                 continue
@@ -396,6 +484,12 @@ class Muon(torch.optim.Optimizer):
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
@@ -416,7 +510,58 @@ class Muon(torch.optim.Optimizer):
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
-    def parallel(self, params, group, lr, weight_decay, momentum):
         """
         Perform a parallel optimization step using Muon.
         """
@@ -438,7 +583,7 @@ class Muon(torch.optim.Optimizer):
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -553,12 +698,16 @@ class Muon(torch.optim.Optimizer):
                 maximize=maximize,
             )
-    def step(self, closure=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
         loss = None
         if closure is not None:
@@ -575,11 +724,14 @@ class Muon(torch.optim.Optimizer):
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
                 param_dtensors = []
                 param_tensors = []
-                for p in params:
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
@@ -587,10 +739,13 @@ class Muon(torch.optim.Optimizer):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
                         else:
                             param_dtensors.append(p)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
@@ -608,20 +763,24 @@ class Muon(torch.optim.Optimizer):
                         )
                     self.parallel(
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
                 if len(param_tensors) > 0:
                     self.base(
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
             else:

 import math
 import types
 from dataclasses import dataclass
+from typing import List, Optional, Union, cast
 import torch
 import torch.distributed as dist
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
+    qk_clip_state = None
 @torch.no_grad()
         state.scattered_u = None
         u_dtensor = None
+        scales_full = Muon._compute_scales(p, state.qk_clip_state)
+        if scales_full is not None:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            local_rank = dist.get_rank(group=state.process_group)
+            scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
+            scales_local = DTensor.from_local(
+                scales_local,
+                placements=p.placements,
+                device_mesh=p.device_mesh,
+            )
+            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
 def default_is_muon(name, x):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    return x.ndim >= 2 and not any(key in name for key in skip_keys)
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    muon_params, muon_names = [], []
+    non_muon_params = []
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
     return [
         {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
         },
         {
+            "params": non_muon_params,
+            "use_muon": False,
         },
     ]
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+    Example:
+        'model.3.attn.wq.weight'      -> ('wq', 3)
+        'model.5.attn.wk.weight'      -> ('wk', 5)
+        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight'  -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+    kind = parts[-2]
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+    return None, -1
@dataclass
class QKClipInfo:
    """QK-clip settings resolved for a single parameter.

    Bundles the static clip configuration with the logits supplied for the
    current optimizer step.
    """
    # Projection kind parsed from the parameter name:
    # 'wq'/'q_proj', 'wk'/'k_proj', or None when it is not a Q/K projection.
    kind: Optional[str]
    # Head indices to consider for clipping.
    indices: List[int]
    # Per-head dimension, taken from the clip configuration.
    head_dim: int
    # Logit threshold, taken from the clip configuration.
    threshold: float
    # Logits associated with this parameter, or None when none were given.
    logit: Optional[torch.Tensor]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
+        clip_config: Configuration for QK clipping. Expected keys:
+            - "q_indices" (list[int]): Indices of query heads to consider.
+            - "k_indices" (list[int]): Indices of key heads to consider.
+            - "head_dim" (int): Dimensionality of each attention head.
+            - "threshold" (float): Threshold value; heads whose QK logits exceed
+            this value will be scaled down.
+            Default is:
+                {
+                    "q_indices": [],
+                    "k_indices": [],
+                    "head_dim": 128,
+                    "threshold": 100
+                }
     """
+    def __init__(self,
+                 params,
+                 lr=1e-3,
+                 momentum=0.95,
+                 nesterov=True,
+                 ns_steps=5,
+                 weight_decay=0.1,
+                 adamw_betas=(0.9, 0.95),
+                 adamw_eps=1e-8,
+                 none_grad=True,
+                 debug=False,
+                 clip_config={
+                     "q_indices": [],
+                     "k_indices": [],
+                     "head_dim": 128,
+                     "threshold": 100
+                 }):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
+        self.clip_config = clip_config
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
+    def init_state_and_assign_params(self, names, params, group, qk_logits):
         param_to_state = {}
         param_to_flops = {}
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
+        paired = list(zip(names, params))
+        paired_sorted = sorted(paired,
+                               key=lambda x: param_to_flops[id(x[1])],
+                               reverse=True)
+        names_sorted, params_sorted = zip(*paired_sorted)
+        ordered_names = list(names_sorted)
+        ordered_params = list(params_sorted)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
+        for n, p in zip(ordered_names, ordered_params):
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            param_to_state[id(p)].qk_clip_state = qk_clip_state
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
+    def base(self, names, params, group, lr, weight_decay, momentum,
+             qk_logits):
         # generate weight updates in distributed fashion
+        for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            scales_full = self._compute_scales(p, qk_clip_state)
+            if scales_full is not None:
+                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
+    def get_qk_clip_info(self, n, qk_logits):
+        head_dim = self.clip_config.get('head_dim')
+        threshold = self.clip_config.get('threshold')
+        kind, layer_idx = parse_qk_layer(n)
+        logit, indices = None, []
+        if qk_logits is not None and kind is not None:
+            logit = qk_logits[layer_idx]
+            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+            indices = self.clip_config.get(indices_key, []) or []
+        return QKClipInfo(
+            kind=kind,
+            indices=indices,
+            head_dim=head_dim,
+            threshold=threshold,
+            logit=logit,
+        )
+    @staticmethod
+    def _compute_scales(p, qk_clip_state):
+        kind = qk_clip_state.kind
+        indices = qk_clip_state.indices
+        head_dim = qk_clip_state.head_dim
+        threshold = qk_clip_state.threshold
+        logit = qk_clip_state.logit
+        H_global = p.shape[0] // head_dim
+        scales_full = torch.ones(H_global, device=p.data.device)
+        scaling = 0
+        for logit_idx, head_idx in enumerate(indices):
+            v_ele = float(logit[logit_idx])
+            if v_ele > threshold:
+                new_scale = math.sqrt(threshold / v_ele)
+                if new_scale < scales_full[head_idx]:
+                    scales_full[head_idx] = new_scale
+                    logger.info(
+                        f"[{kind}] Head {head_idx} exceeded threshold "
+                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    )
+                    scaling += 1
+        return scales_full if scaling > 0 else None
+    @staticmethod
+    def _qk_clip(p, scales, head_dim):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+    def parallel(self, names, params, group, lr, weight_decay, momentum,
+                 qk_logits):
         """
         Perform a parallel optimization step using Muon.
         """
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
+            names, params, group, qk_logits)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 maximize=maximize,
             )
+    def step(self, closure=None, qk_logits=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
+            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                to 1D tensors of shape (num_heads,), representing the maximum
+                QK logits across all tokens, computed as
+                (1 / sqrt(head_dim)) * (Q @ K^T).
         """
         loss = None
         if closure is not None:
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
+                names = group["names"]
                 param_dtensors = []
                 param_tensors = []
+                name_dtensors = []
+                name_tensors = []
+                for n, p in zip(names, params):
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
+                            name_tensors.append(n)
                         else:
                             param_dtensors.append(p)
+                            name_dtensors.append(n)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
+                        name_tensors.append(n)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
                         )
                     self.parallel(
+                        name_dtensors,
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
                 if len(param_tensors) > 0:
                     self.base(
+                        name_tensors,
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
             else:

build/torch28-cxx11-rocm64-x86_64-linux/optimizer/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _optimizer_4043ece_dirty
-ops = torch.ops._optimizer_4043ece_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_optimizer_4043ece_dirty::{op_name}"

 import torch
+from . import _optimizer_9c21645_dirty
+ops = torch.ops._optimizer_9c21645_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_optimizer_9c21645_dirty::{op_name}"

build/torch28-cxx11-rocm64-x86_64-linux/optimizer/{_optimizer_4043ece_dirty.abi3.so → _optimizer_9c21645_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2ccbbb16b7d65cd7a4cb562dbef2a3d963f042836c700527a6bb755a8277f0c1
 size 1750024

 version https://git-lfs.github.com/spec/v1
+oid sha256:9a477575e3cc30e54d355b3e778240dc25fb0dab30362f3540dc5f925ac03ba1
 size 1750024

build/torch28-cxx11-rocm64-x86_64-linux/optimizer/muon.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import math
 import types
 from dataclasses import dataclass
-from typing import Optional, Union, cast
 import torch
 import torch.distributed as dist
@@ -66,6 +66,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
 @torch.no_grad()
@@ -193,32 +194,93 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
         state.scattered_u = None
         u_dtensor = None
 def default_is_muon(name, x):
-    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
     return [
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            True
         },
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (not is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            False
         },
     ]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -246,21 +308,38 @@ class Muon(torch.optim.Optimizer):
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
     """
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        momentum=0.95,
-        nesterov=True,
-        ns_steps=5,
-        weight_decay=0.1,
-        adamw_betas=(0.9, 0.95),
-        adamw_eps=1e-8,
-        none_grad=True,
-        debug=False,
-    ):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -292,6 +371,7 @@ class Muon(torch.optim.Optimizer):
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
@@ -327,7 +407,7 @@ class Muon(torch.optim.Optimizer):
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
-    def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -346,15 +426,21 @@ class Muon(torch.optim.Optimizer):
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
-        ordered_params = sorted(params,
-                                key=lambda p: param_to_flops[id(p)],
-                                reverse=True)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
-        for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
@@ -364,14 +450,16 @@ class Muon(torch.optim.Optimizer):
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
-    def base(self, params, group, lr, weight_decay, momentum):
         # generate weight updates in distributed fashion
-        for p in params:
             g = p.grad
             if g is None:
                 continue
@@ -396,6 +484,12 @@ class Muon(torch.optim.Optimizer):
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
@@ -416,7 +510,58 @@ class Muon(torch.optim.Optimizer):
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
-    def parallel(self, params, group, lr, weight_decay, momentum):
         """
         Perform a parallel optimization step using Muon.
         """
@@ -438,7 +583,7 @@ class Muon(torch.optim.Optimizer):
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -553,12 +698,16 @@ class Muon(torch.optim.Optimizer):
                 maximize=maximize,
             )
-    def step(self, closure=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
         loss = None
         if closure is not None:
@@ -575,11 +724,14 @@ class Muon(torch.optim.Optimizer):
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
                 param_dtensors = []
                 param_tensors = []
-                for p in params:
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
@@ -587,10 +739,13 @@ class Muon(torch.optim.Optimizer):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
                         else:
                             param_dtensors.append(p)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
@@ -608,20 +763,24 @@ class Muon(torch.optim.Optimizer):
                         )
                     self.parallel(
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
                 if len(param_tensors) > 0:
                     self.base(
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
             else:

 import math
 import types
 from dataclasses import dataclass
+from typing import List, Optional, Union, cast
 import torch
 import torch.distributed as dist
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
+    qk_clip_state = None
 @torch.no_grad()
         state.scattered_u = None
         u_dtensor = None
+        scales_full = Muon._compute_scales(p, state.qk_clip_state)
+        if scales_full is not None:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            local_rank = dist.get_rank(group=state.process_group)
+            scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
+            scales_local = DTensor.from_local(
+                scales_local,
+                placements=p.placements,
+                device_mesh=p.device_mesh,
+            )
+            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
 def default_is_muon(name, x):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    return x.ndim >= 2 and not any(key in name for key in skip_keys)
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    muon_params, muon_names = [], []
+    non_muon_params = []
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
     return [
         {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
         },
         {
+            "params": non_muon_params,
+            "use_muon": False,
         },
     ]
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+    Example:
+        'model.3.attn.wq.weight'      -> ('wq', 3)
+        'model.5.attn.wk.weight'      -> ('wk', 5)
+        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight'  -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+    kind = parts[-2]
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+    return None, -1
@dataclass
class QKClipInfo:
    """QK-clip settings resolved for a single parameter.

    Bundles the static clip configuration with the logits supplied for the
    current optimizer step.
    """
    # Projection kind parsed from the parameter name:
    # 'wq'/'q_proj', 'wk'/'k_proj', or None when it is not a Q/K projection.
    kind: Optional[str]
    # Head indices to consider for clipping.
    indices: List[int]
    # Per-head dimension, taken from the clip configuration.
    head_dim: int
    # Logit threshold, taken from the clip configuration.
    threshold: float
    # Logits associated with this parameter, or None when none were given.
    logit: Optional[torch.Tensor]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
+        clip_config: Configuration for QK clipping. Expected keys:
+            - "q_indices" (list[int]): Indices of query heads to consider.
+            - "k_indices" (list[int]): Indices of key heads to consider.
+            - "head_dim" (int): Dimensionality of each attention head.
+            - "threshold" (float): Threshold value; heads whose QK logits exceed
+            this value will be scaled down.
+            Default is:
+                {
+                    "q_indices": [],
+                    "k_indices": [],
+                    "head_dim": 128,
+                    "threshold": 100
+                }
     """
+    def __init__(self,
+                 params,
+                 lr=1e-3,
+                 momentum=0.95,
+                 nesterov=True,
+                 ns_steps=5,
+                 weight_decay=0.1,
+                 adamw_betas=(0.9, 0.95),
+                 adamw_eps=1e-8,
+                 none_grad=True,
+                 debug=False,
+                 clip_config={
+                     "q_indices": [],
+                     "k_indices": [],
+                     "head_dim": 128,
+                     "threshold": 100
+                 }):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
+        self.clip_config = clip_config
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
+    def init_state_and_assign_params(self, names, params, group, qk_logits):
         param_to_state = {}
         param_to_flops = {}
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
+        paired = list(zip(names, params))
+        paired_sorted = sorted(paired,
+                               key=lambda x: param_to_flops[id(x[1])],
+                               reverse=True)
+        names_sorted, params_sorted = zip(*paired_sorted)
+        ordered_names = list(names_sorted)
+        ordered_params = list(params_sorted)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
+        for n, p in zip(ordered_names, ordered_params):
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            param_to_state[id(p)].qk_clip_state = qk_clip_state
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
+    def base(self, names, params, group, lr, weight_decay, momentum,
+             qk_logits):
         # generate weight updates in distributed fashion
+        for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            scales_full = self._compute_scales(p, qk_clip_state)
+            if scales_full is not None:
+                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
+    def get_qk_clip_info(self, n, qk_logits):
+        head_dim = self.clip_config.get('head_dim')
+        threshold = self.clip_config.get('threshold')
+        kind, layer_idx = parse_qk_layer(n)
+        logit, indices = None, []
+        if qk_logits is not None and kind is not None:
+            logit = qk_logits[layer_idx]
+            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+            indices = self.clip_config.get(indices_key, []) or []
+        return QKClipInfo(
+            kind=kind,
+            indices=indices,
+            head_dim=head_dim,
+            threshold=threshold,
+            logit=logit,
+        )
+    @staticmethod
+    def _compute_scales(p, qk_clip_state):
+        kind = qk_clip_state.kind
+        indices = qk_clip_state.indices
+        head_dim = qk_clip_state.head_dim
+        threshold = qk_clip_state.threshold
+        logit = qk_clip_state.logit
+        H_global = p.shape[0] // head_dim
+        scales_full = torch.ones(H_global, device=p.data.device)
+        scaling = 0
+        for logit_idx, head_idx in enumerate(indices):
+            v_ele = float(logit[logit_idx])
+            if v_ele > threshold:
+                new_scale = math.sqrt(threshold / v_ele)
+                if new_scale < scales_full[head_idx]:
+                    scales_full[head_idx] = new_scale
+                    logger.info(
+                        f"[{kind}] Head {head_idx} exceeded threshold "
+                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    )
+                    scaling += 1
+        return scales_full if scaling > 0 else None
+    @staticmethod
+    def _qk_clip(p, scales, head_dim):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+    def parallel(self, names, params, group, lr, weight_decay, momentum,
+                 qk_logits):
         """
         Perform a parallel optimization step using Muon.
         """
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
+            names, params, group, qk_logits)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 maximize=maximize,
             )
+    def step(self, closure=None, qk_logits=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
+            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                to 1D tensors of shape (num_heads,), representing the maximum
+                QK logits across all tokens, computed as
+                (1 / sqrt(head_dim)) * (Q @ K^T).
         """
         loss = None
         if closure is not None:
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
+                names = group["names"]
                 param_dtensors = []
                 param_tensors = []
+                name_dtensors = []
+                name_tensors = []
+                for n, p in zip(names, params):
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
+                            name_tensors.append(n)
                         else:
                             param_dtensors.append(p)
+                            name_dtensors.append(n)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
+                        name_tensors.append(n)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
                         )
                     self.parallel(
+                        name_dtensors,
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
                 if len(param_tensors) > 0:
                     self.base(
+                        name_tensors,
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
             else:

test/test_muon/test.py CHANGED Viewed

@@ -20,7 +20,6 @@ def load_model(fsdp: bool) -> torch.nn.Module:
         trust_remote_code=True,
     ).bfloat16().cuda()
-    torch.manual_seed(0)
     random_grads = []
     for param in model.parameters():
         random_grad = torch.randn_like(param,
@@ -52,17 +51,57 @@ def load_model(fsdp: bool) -> torch.nn.Module:
     return model
-def run_muon(fsdp: bool) -> torch.nn.Module:
     model = load_model(fsdp=fsdp)
     params = get_default_muon_param_groups(model)
-    optim = Muon(params=params)
-    optim.step()
     return model
 def compare_results(parallel_muon_result: torch.nn.Module,
-                    sequential_muon_result: torch.nn.Module) -> None:
     for (name_p, p), (name_s,
                       s) in zip(parallel_muon_result.named_parameters(),
                                 sequential_muon_result.named_parameters()):
@@ -71,16 +110,10 @@ def compare_results(parallel_muon_result: torch.nn.Module,
         # Parallel Muon should exactly match Sequential Muon
         if torch.abs(p - s).max() > 0:
             max_diff_index = torch.argmax(torch.abs(p - s))
-            logger.error(f"Models differ at parameter {name_p}")
-            return
-    logger.info("Models match!")
-def test_muon():
-    parallel_muon_result = run_muon(fsdp=True)
-    sequential_muon_result = run_muon(fsdp=False)
-    compare_results(parallel_muon_result, sequential_muon_result)
 if __name__ == "__main__":

         trust_remote_code=True,
     ).bfloat16().cuda()
     random_grads = []
     for param in model.parameters():
         random_grad = torch.randn_like(param,
     return model
def run_muon(fsdp: bool, qk_clip: bool, seed: int) -> torch.nn.Module:
    """Run a single Muon step on a freshly loaded model and return the model.

    Args:
        fsdp: Forwarded to ``load_model`` to select the FSDP or plain variant.
        qk_clip: When True, random per-layer QK logits are passed to
            ``optim.step`` so the clipping path is exercised; otherwise
            ``qk_logits`` is None.
        seed: Seed applied to the torch (and, if available, CUDA) RNGs before
            anything else is created.

    Returns:
        The model after one optimizer step.
    """
    # Seed first so everything created afterwards is reproducible per seed.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    model = load_model(fsdp=fsdp)
    param_groups = get_default_muon_param_groups(model)

    config = model.config
    num_heads = config.num_attention_heads

    if qk_clip:
        # One vector of per-head logits for every layer index.
        qk_logits = {
            layer: torch.rand(num_heads)
            for layer in range(config.num_hidden_layers)
        }
    else:
        qk_logits = None

    clip_config = {
        "q_indices": list(range(num_heads)),
        "k_indices": list(range(num_heads)),
        "head_dim": config.hidden_size // num_heads,
        "threshold": 0.5
    }
    optim = Muon(params=param_groups, clip_config=clip_config)
    optim.step(qk_logits=qk_logits)
    return model
def run_case(qk_clip: bool, seed: int = 0):
    """Compare parallel (FSDP) and sequential Muon for one configuration.

    Args:
        qk_clip: Whether QK clipping logits are supplied to the optimizer.
        seed: RNG seed shared by both runs.

    Returns:
        ``(success, label)`` where ``success`` is the value returned by
        ``compare_results`` and ``label`` names the configuration.
    """
    label = f"qk_clip={'ON' if qk_clip else 'OFF'}"
    # Both runs use the same seed so they start from identical state.
    parallel_model = run_muon(fsdp=True, qk_clip=qk_clip, seed=seed)
    sequential_model = run_muon(fsdp=False, qk_clip=qk_clip, seed=seed)
    return compare_results(parallel_model, sequential_model,
                           label=label), label
def test_muon():
    """Check that parallel and sequential Muon agree, with and without QK clip.

    Both configurations are always executed so a single run reports every
    failing case; the test then fails if any configuration diverged.

    Raises:
        AssertionError: If at least one configuration produced differing
            parameters between the parallel and sequential runs.
    """
    results = [run_case(qk_clip=flag, seed=0) for flag in (False, True)]

    failed_labels = []
    for success, label in results:
        if success:
            logger.info(f"[{label}] Models match")
        else:
            logger.error(f"[{label}] Models differ")
            failed_labels.append(label)

    # Without this assertion a mismatch would only be visible in the logs and
    # the test would still be reported as passing.
    assert not failed_labels, (
        f"Parallel and sequential Muon differ for: {', '.join(failed_labels)}")
 def compare_results(parallel_muon_result: torch.nn.Module,
+                    sequential_muon_result: torch.nn.Module,
+                    label: str) -> None:
+    success = True
     for (name_p, p), (name_s,
                       s) in zip(parallel_muon_result.named_parameters(),
                                 sequential_muon_result.named_parameters()):
         # Parallel Muon should exactly match Sequential Muon
         if torch.abs(p - s).max() > 0:
             max_diff_index = torch.argmax(torch.abs(p - s))
+            logger.info(f"Models differ at parameter {name_p}")
+            success = False
+    return success
 if __name__ == "__main__":

torch-ext/optimizer/muon.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 import math
 import types
 from dataclasses import dataclass
-from typing import Optional, Union, cast
 import torch
 import torch.distributed as dist
@@ -66,6 +66,7 @@ class _muon_state:
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
 @torch.no_grad()
@@ -193,32 +194,93 @@ def _update_param(p, state, lr, adjusted_lr, weight_decay, rank,
         state.scattered_u = None
         u_dtensor = None
 def default_is_muon(name, x):
-    return x.ndim >= 2 and "embed_tokens" not in name and "lm_head" not in name
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
     return [
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            True
         },
         {
-            "params": [
-                p for n, p in model.named_parameters()
-                if (not is_muon_func(n, p) and p.requires_grad)
-            ],
-            "use_muon":
-            False
         },
     ]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
@@ -246,21 +308,38 @@ class Muon(torch.optim.Optimizer):
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
     """
-    def __init__(
-        self,
-        params,
-        lr=1e-3,
-        momentum=0.95,
-        nesterov=True,
-        ns_steps=5,
-        weight_decay=0.1,
-        adamw_betas=(0.9, 0.95),
-        adamw_eps=1e-8,
-        none_grad=True,
-        debug=False,
-    ):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
@@ -292,6 +371,7 @@ class Muon(torch.optim.Optimizer):
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
@@ -327,7 +407,7 @@ class Muon(torch.optim.Optimizer):
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
-    def init_state_and_assign_params(self, params, group):
         param_to_state = {}
         param_to_flops = {}
@@ -346,15 +426,21 @@ class Muon(torch.optim.Optimizer):
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
-        ordered_params = sorted(params,
-                                key=lambda p: param_to_flops[id(p)],
-                                reverse=True)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
-        for p in ordered_params:
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
@@ -364,14 +450,16 @@ class Muon(torch.optim.Optimizer):
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
-    def base(self, params, group, lr, weight_decay, momentum):
         # generate weight updates in distributed fashion
-        for p in params:
             g = p.grad
             if g is None:
                 continue
@@ -396,6 +484,12 @@ class Muon(torch.optim.Optimizer):
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
@@ -416,7 +510,58 @@ class Muon(torch.optim.Optimizer):
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
-    def parallel(self, params, group, lr, weight_decay, momentum):
         """
         Perform a parallel optimization step using Muon.
         """
@@ -438,7 +583,7 @@ class Muon(torch.optim.Optimizer):
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
-            params, group)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
@@ -553,12 +698,16 @@ class Muon(torch.optim.Optimizer):
                 maximize=maximize,
             )
-    def step(self, closure=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
         """
         loss = None
         if closure is not None:
@@ -575,11 +724,14 @@ class Muon(torch.optim.Optimizer):
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
                 param_dtensors = []
                 param_tensors = []
-                for p in params:
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
@@ -587,10 +739,13 @@ class Muon(torch.optim.Optimizer):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
                         else:
                             param_dtensors.append(p)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
@@ -608,20 +763,24 @@ class Muon(torch.optim.Optimizer):
                         )
                     self.parallel(
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
                 if len(param_tensors) > 0:
                     self.base(
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
                     )
             else:

 import math
 import types
 from dataclasses import dataclass
+from typing import List, Optional, Union, cast
 import torch
 import torch.distributed as dist
     compute_event: torch.cuda.Event | None = None
     scatter_event: torch.cuda.Event | None = None
     process_group = None
+    qk_clip_state = None
 @torch.no_grad()
         state.scattered_u = None
         u_dtensor = None
+        scales_full = Muon._compute_scales(p, state.qk_clip_state)
+        if scales_full is not None:
+            num_ranks = dist.get_world_size(group=state.process_group)
+            local_rank = dist.get_rank(group=state.process_group)
+            scales_local = scales_full.chunk(num_ranks, dim=0)[local_rank]
+            scales_local = DTensor.from_local(
+                scales_local,
+                placements=p.placements,
+                device_mesh=p.device_mesh,
+            )
+            Muon._qk_clip(p, scales_local, state.qk_clip_state.head_dim)
 def default_is_muon(name, x):
+    skip_keys = ["embed_tokens", "lm_head", "tok_embeddings", "output"]
+    return x.ndim >= 2 and not any(key in name for key in skip_keys)
 def get_default_muon_param_groups(model, is_muon_func=default_is_muon):
+    muon_params, muon_names = [], []
+    non_muon_params = []
+    for n, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        if is_muon_func(n, p):
+            muon_params.append(p)
+            muon_names.append(n)
+        else:
+            non_muon_params.append(p)
     return [
         {
+            "params": muon_params,
+            "names": muon_names,
+            "use_muon": True,
         },
         {
+            "params": non_muon_params,
+            "use_muon": False,
         },
     ]
+def parse_qk_layer(name: str) -> tuple[str | None, int]:
+    """
+    Parse a parameter name to check if it is a query/key projection layer
+    ('wq', 'wk', 'q_proj', 'k_proj') and return (kind, layer_index).
+    Returns:
+        (kind, layer_idx) or (None, -1) if not matched.
+    Example:
+        'model.3.attn.wq.weight'      -> ('wq', 3)
+        'model.5.attn.wk.weight'      -> ('wk', 5)
+        'model.2.attn.q_proj.weight'  -> ('q_proj', 2)
+        'model.7.attn.k_proj.weight'  -> ('k_proj', 7)
+        'model.4.attn.v_proj.weight'  -> (None, -1)
+    """
+    parts = name.split('.')
+    if len(parts) < 3:
+        return None, -1
+    kind = parts[-2]
+    layer_idx = -1
+    for part in reversed(parts):
+        if part.isdigit():
+            layer_idx = int(part)
+            break
+    if kind in ('wq', 'wk', 'q_proj', 'k_proj'):
+        return kind, layer_idx
+    return None, -1
@dataclass
class QKClipInfo:
    """Per-parameter QK-clip data built from the clip config and runtime logits.

    Attributes:
        kind: 'wq'/'q_proj' or 'wk'/'k_proj', or None for other parameters.
        indices: Which heads to consider for clipping.
        head_dim: Head dimension, taken from the config.
        threshold: Clipping threshold, taken from the config.
        logit: Runtime logits for this parameter's layer, or None.
    """
    kind: Optional[str]
    indices: List[int]
    head_dim: int
    threshold: float
    logit: Optional[torch.Tensor]
 class Muon(torch.optim.Optimizer):
     """
     Muon - MomentUm Orthogonalized by Newton-schulz
         adamw_eps: The epsilon for the internal AdamW.
         none_grad: Whether to set p.grad to None after gathering the gradients. This can save memory.
         debug: Whether to print debug information.
+        clip_config: Configuration for QK clipping. Expected keys:
+            - "q_indices" (list[int]): Indices of query heads to consider.
+            - "k_indices" (list[int]): Indices of key heads to consider.
+            - "head_dim" (int): Dimensionality of each attention head.
+            - "threshold" (float): Threshold value; heads whose QK logits exceed
+            this value will be scaled down.
+            Default is:
+                {
+                    "q_indices": [],
+                    "k_indices": [],
+                    "head_dim": 128,
+                    "threshold": 100
+                }
     """
+    def __init__(self,
+                 params,
+                 lr=1e-3,
+                 momentum=0.95,
+                 nesterov=True,
+                 ns_steps=5,
+                 weight_decay=0.1,
+                 adamw_betas=(0.9, 0.95),
+                 adamw_eps=1e-8,
+                 none_grad=True,
+                 debug=False,
+                 clip_config={
+                     "q_indices": [],
+                     "k_indices": [],
+                     "head_dim": 128,
+                     "threshold": 100
+                 }):
         defaults = dict(
             lr=lr,
             weight_decay=weight_decay,
         self.comm_stream = torch.cuda.Stream()
         self.compute_stream = torch.cuda.Stream()
         self.debug = debug
+        self.clip_config = clip_config
     def _calc_flops(self, G, steps):
         assert len(G.shape) == 2
         else:
             raise ValueError(f"Unsupported placements ({p.placements}).")
+    def init_state_and_assign_params(self, names, params, group, qk_logits):
         param_to_state = {}
         param_to_flops = {}
             print(f"Total TFLOPs for Muon: {total_flops / 1e12:.2f} TFLOPs",
                   flush=True)
+        paired = list(zip(names, params))
+        paired_sorted = sorted(paired,
+                               key=lambda x: param_to_flops[id(x[1])],
+                               reverse=True)
+        names_sorted, params_sorted = zip(*paired_sorted)
+        ordered_names = list(names_sorted)
+        ordered_params = list(params_sorted)
         round_robin = 0
         mesh = None
         shard_mesh = None
         process_group = None
+        for n, p in zip(ordered_names, ordered_params):
             if mesh is None:
                 mesh = p.device_mesh
                 shard_mesh, process_group = self.get_shard_mesh(p, self.rank)
             param_to_state[id(p)] = _muon_state()
             param_to_state[id(p)].worker_rank = shard_mesh[round_robin].item()
             param_to_state[id(p)].process_group = process_group
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            param_to_state[id(p)].qk_clip_state = qk_clip_state
             round_robin = (round_robin + 1) % len(shard_mesh)
         return param_to_state, ordered_params
+    def base(self, names, params, group, lr, weight_decay, momentum,
+             qk_logits):
         # generate weight updates in distributed fashion
+        for n, p in zip(names, params):
             g = p.grad
             if g is None:
                 continue
             adjusted_lr = self.adjust_lr_for_muon(lr, p.shape)
             Muon._update_p(p, u, lr, adjusted_lr, weight_decay)
+            qk_clip_state = self.get_qk_clip_info(n, qk_logits)
+            scales_full = self._compute_scales(p, qk_clip_state)
+            if scales_full is not None:
+                Muon._qk_clip(p, scales_full, qk_clip_state.head_dim)
     def _update_g(self, p, g, group, momentum):
         # calc update
         state = self.state[p]
         # apply update
         p.data.add_(u, alpha=-adjusted_lr)
+    def get_qk_clip_info(self, n, qk_logits):
+        head_dim = self.clip_config.get('head_dim')
+        threshold = self.clip_config.get('threshold')
+        kind, layer_idx = parse_qk_layer(n)
+        logit, indices = None, []
+        if qk_logits is not None and kind is not None:
+            logit = qk_logits[layer_idx]
+            indices_key = 'q_indices' if 'q' in kind else 'k_indices'
+            indices = self.clip_config.get(indices_key, []) or []
+        return QKClipInfo(
+            kind=kind,
+            indices=indices,
+            head_dim=head_dim,
+            threshold=threshold,
+            logit=logit,
+        )
+    @staticmethod
+    def _compute_scales(p, qk_clip_state):
+        kind = qk_clip_state.kind
+        indices = qk_clip_state.indices
+        head_dim = qk_clip_state.head_dim
+        threshold = qk_clip_state.threshold
+        logit = qk_clip_state.logit
+        H_global = p.shape[0] // head_dim
+        scales_full = torch.ones(H_global, device=p.data.device)
+        scaling = 0
+        for logit_idx, head_idx in enumerate(indices):
+            v_ele = float(logit[logit_idx])
+            if v_ele > threshold:
+                new_scale = math.sqrt(threshold / v_ele)
+                if new_scale < scales_full[head_idx]:
+                    scales_full[head_idx] = new_scale
+                    logger.info(
+                        f"[{kind}] Head {head_idx} exceeded threshold "
+                        f"(value={v_ele:.4f}, threshold={threshold:.4f}) -> applying scale={new_scale:.4f}"
+                    )
+                    scaling += 1
+        return scales_full if scaling > 0 else None
+    @staticmethod
+    def _qk_clip(p, scales, head_dim):
+        W = p.data.view(-1, head_dim, p.data.shape[1])
+        W.mul_(scales.view(-1, 1, 1))
+    def parallel(self, names, params, group, lr, weight_decay, momentum,
+                 qk_logits):
         """
         Perform a parallel optimization step using Muon.
         """
             p.grad = g
         param_to_state, ordered_params = self.init_state_and_assign_params(
+            names, params, group, qk_logits)
         def enqueue_gathers(start_idx, chunk_size):
             for p in ordered_params[start_idx:start_idx + chunk_size]:
                 maximize=maximize,
             )
+    def step(self, closure=None, qk_logits=None):
         """Perform a single optimization step.
         Args:
             closure (Callable, optional): A closure that reevaluates the model
                 and returns the loss.
+            qk_logits (dict[int, Tensor], optional): A dictionary mapping layer indices
+                to 1D tensors of shape (num_heads,), representing the maximum
+                QK logits across all tokens, computed as
+                (1 / sqrt(head_dim)) * (Q @ K^T).
         """
         loss = None
         if closure is not None:
                 lr = group["lr"]
                 weight_decay = group["weight_decay"]
                 momentum = group["momentum"]
+                names = group["names"]
                 param_dtensors = []
                 param_tensors = []
+                name_dtensors = []
+                name_tensors = []
+                for n, p in zip(names, params):
                     if p is None or p.grad is None:
                         continue
                     if isinstance(p.data, DTensor):
                                 isinstance(placement, Replicate)
                                 for placement in p.placements):
                             param_tensors.append(p)
+                            name_tensors.append(n)
                         else:
                             param_dtensors.append(p)
+                            name_dtensors.append(n)
                     elif isinstance(p.data, torch.Tensor):
                         param_tensors.append(p)
+                        name_tensors.append(n)
                     else:
                         raise TypeError(
                             f"Unsupported parameter type: {type(p.data)}")
                         )
                     self.parallel(
+                        name_dtensors,
                         param_dtensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
                 if len(param_tensors) > 0:
                     self.base(
+                        name_tensors,
                         param_tensors,
                         group,
                         lr=lr,
                         weight_decay=weight_decay,
                         momentum=momentum,
+                        qk_logits=qk_logits,
                     )
             else: