drbh committed
Commit 733f7f4 · 1 Parent(s): 0daa7ef

feat: impl backward experts

.gitignore CHANGED
@@ -12,4 +12,8 @@ tests
 torch-ext/registration.h
 torch-ext/yamoe/_ops.py
 csrc/batch_mm.cu
-torch-ext/yamoe/*.abi3.so
+torch-ext/yamoe/*.abi3.so
+
+build-ext
+build
+exploration

build.toml CHANGED
@@ -32,5 +32,6 @@ src = [
   "csrc/sort.cu",
   "csrc/bincount_cumsum.cu",
   "csrc/batch_mm.cu",
-  "csrc/moe.cpp"
+  "csrc/moe.cpp",
+  "csrc/experts_backward.cu"
 ]

compare_example.py CHANGED
@@ -7,10 +7,11 @@
 
 import time
 import torch
-from kernels import get_local_kernel
-from kernels import get_kernel
+from kernels import get_kernel, get_local_kernel
 from pathlib import Path
 from torch.nn import functional as F
+import numpy as np
+import sys
 
 # Set seeds and deterministic flags for reproducibility
 torch.manual_seed(42)
@@ -19,11 +20,23 @@ torch.cuda.manual_seed_all(42)
 torch.backends.cudnn.deterministic = True
 torch.backends.cudnn.benchmark = False
 
-yamoe = get_kernel("drbh/yamoe", revision="v0.1.0")
+np.set_printoptions(precision=4)
+
+load_method = 2  # 1: sym, 2: local, 3: hf
+
+if load_method == 1:
+    sys.path.insert(0, "./torch-ext")
+    import yamoe
+elif load_method == 2:
+    yamoe = get_local_kernel(Path("result"), "yamoe")
+elif load_method == 3:
+    yamoe = get_kernel("drbh/yamoe", revision="v0.1.0")
+
+binned_experts_ref = yamoe.vendored.yamoe_ref.binned_experts_ref
+GptOssExperts = yamoe.vendored.gpt_oss_mlp.GptOssExperts
 
 # Configuration
 batch_size, seq_len, hidden_dim = 4, 1024, 2880
-# batch_size, seq_len, hidden_dim = 4, 32, 1024
 num_experts, top_k = 8, 2
 
 # Create routing weights
@@ -52,6 +65,7 @@ torch.nn.init.trunc_normal_(gate_up_proj, std=0.02)
 torch.nn.init.trunc_normal_(down_proj, std=0.02)
 
 routing_weights = routing_weights.to(dtype=torch.float32, device="cuda")
+expert_capacity = batch_seq * top_k // num_experts * 2
 
 
 # Warmup
@@ -64,7 +78,7 @@ for _ in range(5):
         gate_up_proj_bias,
         down_proj,
         down_proj_bias,
-        seq_len,
+        expert_capacity,
         num_experts,
         top_k,
     )
@@ -74,6 +88,7 @@ torch.cuda.synchronize()
 torch.cuda.reset_peak_memory_stats()
 start = time.perf_counter()
 
+
 with torch.no_grad():
     output = yamoe.experts(
         hidden_states.view(-1, hidden_dim),
@@ -83,7 +98,7 @@ with torch.no_grad():
         gate_up_proj_bias,
         down_proj,
         down_proj_bias,
-        seq_len,
+        expert_capacity,
         num_experts,
         top_k,
     )
@@ -104,7 +119,7 @@ config.hidden_size = hidden_dim
 config.intermediate_size = 4 * hidden_dim
 config.num_local_experts = num_experts
 
-model = yamoe.reference.GptOssExperts(config)
+model = GptOssExperts(config)
 
 # set the weights and biases from above to the reference model
 model.gate_up_proj.data = gate_up_proj
@@ -133,79 +148,165 @@ ref_memory = peak_mem_mb
 # Reshape reference output to match kernel output
 ref_output_reshaped = ref_output.view(kernel_output.shape)
 
-# Calculate similarity metrics
-mse = torch.nn.functional.mse_loss(kernel_output, ref_output_reshaped).item()
-mae = torch.nn.functional.l1_loss(kernel_output, ref_output_reshaped).item()
+# Test yamoe_ref implementation
+expert_capacity = batch_seq * top_k // num_experts * 2  # Generous capacity
+
+torch.cuda.synchronize()
+torch.cuda.reset_peak_memory_stats()
+start = time.perf_counter()
+
+with torch.no_grad():
+    yamoe_ref_output = binned_experts_ref(
+        hidden_states,
+        router_indices,
+        routing_weights,
+        gate_up_proj,
+        gate_up_proj_bias,
+        down_proj,
+        down_proj_bias,
+        expert_capacity,
+    )
+
+torch.cuda.synchronize()
+yamoe_ref_time = (time.perf_counter() - start) * 1e3
+yamoe_ref_memory = torch.cuda.max_memory_allocated() / (1024 * 1024)
+
+# Reshape yamoe_ref output to match kernel output
+yamoe_ref_output_reshaped = yamoe_ref_output.view(kernel_output.shape)
+
+# Calculate similarity metrics between kernel and reference
+mse_kernel_ref = torch.nn.functional.mse_loss(kernel_output, ref_output_reshaped).item()
+mae_kernel_ref = torch.nn.functional.l1_loss(kernel_output, ref_output_reshaped).item()
 
 # Cosine similarity
 kernel_flat = kernel_output.view(-1)
 ref_flat = ref_output_reshaped.view(-1)
-cosine_sim = torch.nn.functional.cosine_similarity(
+yamoe_ref_flat = yamoe_ref_output_reshaped.view(-1)
+cosine_sim_kernel_ref = torch.nn.functional.cosine_similarity(
     kernel_flat.unsqueeze(0), ref_flat.unsqueeze(0)
 ).item()
 
 # Relative error (L2 norm of difference / L2 norm of reference)
-diff_norm = torch.norm(kernel_output - ref_output_reshaped).item()
+diff_norm_kernel_ref = torch.norm(kernel_output - ref_output_reshaped).item()
 ref_norm = torch.norm(ref_output_reshaped).item()
-rel_error = diff_norm / ref_norm if ref_norm > 0 else float("inf")
+rel_error_kernel_ref = diff_norm_kernel_ref / ref_norm if ref_norm > 0 else float("inf")
 
 # Max absolute difference
-max_abs_diff = torch.max(torch.abs(kernel_output - ref_output_reshaped)).item()
+max_abs_diff_kernel_ref = torch.max(
+    torch.abs(kernel_output - ref_output_reshaped)
+).item()
+
+# Calculate similarity metrics between kernel and yamoe_ref
+mse_kernel_yamoe = torch.nn.functional.mse_loss(
+    kernel_output, yamoe_ref_output_reshaped
+).item()
+mae_kernel_yamoe = torch.nn.functional.l1_loss(
+    kernel_output, yamoe_ref_output_reshaped
+).item()
+cosine_sim_kernel_yamoe = torch.nn.functional.cosine_similarity(
+    kernel_flat.unsqueeze(0), yamoe_ref_flat.unsqueeze(0)
+).item()
+diff_norm_kernel_yamoe = torch.norm(kernel_output - yamoe_ref_output_reshaped).item()
+yamoe_ref_norm = torch.norm(yamoe_ref_output_reshaped).item()
+rel_error_kernel_yamoe = (
+    diff_norm_kernel_yamoe / yamoe_ref_norm if yamoe_ref_norm > 0 else float("inf")
+)
+max_abs_diff_kernel_yamoe = torch.max(
+    torch.abs(kernel_output - yamoe_ref_output_reshaped)
+).item()
+
+# Calculate similarity metrics between reference and yamoe_ref
+mse_ref_yamoe = torch.nn.functional.mse_loss(
+    ref_output_reshaped, yamoe_ref_output_reshaped
+).item()
+mae_ref_yamoe = torch.nn.functional.l1_loss(
+    ref_output_reshaped, yamoe_ref_output_reshaped
+).item()
+cosine_sim_ref_yamoe = torch.nn.functional.cosine_similarity(
+    ref_flat.unsqueeze(0), yamoe_ref_flat.unsqueeze(0)
+).item()
+diff_norm_ref_yamoe = torch.norm(ref_output_reshaped - yamoe_ref_output_reshaped).item()
+rel_error_ref_yamoe = (
+    diff_norm_ref_yamoe / yamoe_ref_norm if yamoe_ref_norm > 0 else float("inf")
+)
+max_abs_diff_ref_yamoe = torch.max(
+    torch.abs(ref_output_reshaped - yamoe_ref_output_reshaped)
+).item()
 
 # Print comparison table
-print("\n" + "=" * 80)
-print(f"{'METRIC':<20} {'KERNEL':<15} {'REFERENCE':<15} {'SIMILARITY/SPEEDUP':<15}")
-print("=" * 80)
+print("\n" + "=" * 110)
+print(
+    f"{'METRIC':<20} {'KERNEL':<15} {'REFERENCE':<15} {'YAMOE_REF':<15} {'KERNEL SPEEDUP':<20} {'REF SPEEDUP':<15}"
+)
+print("=" * 110)
 
 print(
-    f"{'Sum':<20} {kernel_output.sum().item():<15.4f} {ref_output_reshaped.sum().item():<15.4f} {'N/A':<15}"
+    f"{'Sum':<20} {kernel_output.sum().item():<15.4f} {ref_output_reshaped.sum().item():<15.4f} {yamoe_ref_output_reshaped.sum().item():<15.4f} {'N/A':<20} {'N/A':<15}"
 )
 print(
-    f"{'Min':<20} {kernel_output.min().item():<15.4f} {ref_output_reshaped.min().item():<15.4f} {'N/A':<15}"
+    f"{'Min':<20} {kernel_output.min().item():<15.4f} {ref_output_reshaped.min().item():<15.4f} {yamoe_ref_output_reshaped.min().item():<15.4f} {'N/A':<20} {'N/A':<15}"
 )
 print(
-    f"{'Max':<20} {kernel_output.max().item():<15.4f} {ref_output_reshaped.max().item():<15.4f} {'N/A':<15}"
+    f"{'Max':<20} {kernel_output.max().item():<15.4f} {ref_output_reshaped.max().item():<15.4f} {yamoe_ref_output_reshaped.max().item():<15.4f} {'N/A':<20} {'N/A':<15}"
 )
 print(
-    f"{'Norm (L2)':<20} {kernel_output.norm().item():<15.4f} {ref_output_reshaped.norm().item():<15.4f} {'N/A':<15}"
+    f"{'Norm (L2)':<20} {kernel_output.norm().item():<15.4f} {ref_output_reshaped.norm().item():<15.4f} {yamoe_ref_output_reshaped.norm().item():<15.4f} {'N/A':<20} {'N/A':<15}"
 )
 print(
-    f"{'Std':<20} {kernel_output.std().item():<15.4f} {ref_output_reshaped.std().item():<15.4f} {'N/A':<15}"
+    f"{'Std':<20} {kernel_output.std().item():<15.4f} {ref_output_reshaped.std().item():<15.4f} {yamoe_ref_output_reshaped.std().item():<15.4f} {'N/A':<20} {'N/A':<15}"
 )
 
-print("-" * 80)
+print("-" * 110)
 print(
-    f"{'Time (ms)':<20} {kernel_time:<15.3f} {ref_time:<15.3f} {ref_time / kernel_time:<15.2f}x"
+    f"{'Time (ms)':<20} {kernel_time:<15.3f} {ref_time:<15.3f} {yamoe_ref_time:<15.3f} {yamoe_ref_time / kernel_time:<20.2f}x {yamoe_ref_time / ref_time:<15.2f}x"
 )
 print(
-    f"{'Memory (MB)':<20} {kernel_memory:<15.2f} {ref_memory:<15.2f} {ref_memory / kernel_memory:<15.2f}x"
+    f"{'Memory (MB)':<20} {kernel_memory:<15.2f} {ref_memory:<15.2f} {yamoe_ref_memory:<15.2f} {yamoe_ref_memory / kernel_memory:<20.2f}x {yamoe_ref_memory / ref_memory:<15.2f}x"
 )
 
-print("-" * 80)
-print("SIMILARITY METRICS")
-print("-" * 80)
-print(f"{'METRIC':<20} {'VALUE':<15} {'DIFFERENCE':<15}")
-print("-" * 80)
-print(f"{'MSE':<20} {mse:<15.6e} {'N/A':<15}")
-print(f"{'MAE':<20} {mae:<15.6e} {'N/A':<15}")
-print(f"{'Cosine Similarity':<20} {cosine_sim:<15.6f} {abs(1.0 - cosine_sim):<15.6f}")
-print(f"{'Relative Error':<20} {rel_error:<15.6e} {'N/A':<15}")
-print(f"{'Max Abs Diff':<20} {max_abs_diff:<15.6e} {'N/A':<15}")
+print("-" * 110)
+print("SIMILARITY METRICS (vs KERNEL)")
+print("-" * 110)
+print(
+    f"{'METRIC':<20} {'KERNEL vs REF':<20} {'KERNEL vs YAMOE_REF':<20} {'REF vs YAMOE_REF':<20}"
+)
+print("-" * 110)
+print(
+    f"{'MSE':<20} {mse_kernel_ref:<20.6e} {mse_kernel_yamoe:<20.6e} {mse_ref_yamoe:<20.6e}"
+)
+print(
+    f"{'MAE':<20} {mae_kernel_ref:<20.6e} {mae_kernel_yamoe:<20.6e} {mae_ref_yamoe:<20.6e}"
+)
+print(
+    f"{'Cosine Similarity':<20} {cosine_sim_kernel_ref:<20.6f} {cosine_sim_kernel_yamoe:<20.6f} {cosine_sim_ref_yamoe:<20.6f}"
+)
+print(
+    f"{'Relative Error':<20} {rel_error_kernel_ref:<20.6e} {rel_error_kernel_yamoe:<20.6e} {rel_error_ref_yamoe:<20.6e}"
+)
+print(
+    f"{'Max Abs Diff':<20} {max_abs_diff_kernel_ref:<20.6e} {max_abs_diff_kernel_yamoe:<20.6e} {max_abs_diff_ref_yamoe:<20.6e}"
+)
 
-print("-" * 80)
+print("-" * 110)
 print("FIRST 10 ELEMENTS COMPARISON")
-print("-" * 80)
+print("-" * 110)
 
+
-# Get first 10 elements as numpy arrays for nice display
-kernel_first_10 = kernel_flat[:10].cpu().numpy()
-ref_first_10 = ref_flat[:10].cpu().numpy()
-diff_first_10 = kernel_first_10 - ref_first_10
+# Get first N elements as numpy arrays for nice display
+N = 10
+kernel_first_10 = kernel_flat[:N].cpu().numpy()
+ref_first_10 = ref_flat[:N].cpu().numpy()
+yamoe_ref_first_10 = yamoe_ref_flat[:N].cpu().numpy()
+diff_kernel_ref = kernel_first_10 - ref_first_10
+diff_kernel_yamoe = kernel_first_10 - yamoe_ref_first_10
 
-print(f"{'INDEX':<5} {'KERNEL':<12} {'REFERENCE':<12} {'DIFF':<12}")
-print("-" * 45)
-for i in range(10):
+print(
+    f"{'INDEX':<5} {'KERNEL':<12} {'REFERENCE':<12} {'YAMOE_REF':<12} {'K-R DIFF':<12} {'K-Y DIFF':<12}"
+)
+print("-" * 70)
+for i in range(N):
     print(
-        f"{i:<5} {kernel_first_10[i]:<12.6f} {ref_first_10[i]:<12.6f} {diff_first_10[i]:<12.6f}"
+        f"{i:<5} {kernel_first_10[i]:<12.6f} {ref_first_10[i]:<12.6f} {yamoe_ref_first_10[i]:<12.6f} {diff_kernel_ref[i]:<12.6f} {diff_kernel_yamoe[i]:<12.6f}"
     )
 
-print("=" * 80)
+print("=" * 110)

csrc/experts_backward.cu ADDED
@@ -0,0 +1,341 @@
+// Backward pass for MoE experts
+
+#include <ATen/cuda/Atomic.cuh>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda_runtime.h>
+#include <torch/torch.h>
+
+void sort_cuda(torch::Tensor x,
+               int64_t end_bit,
+               torch::Tensor x_out,
+               torch::Tensor iota_out);
+void bincount_cumsum_cuda(torch::Tensor input,
+                          torch::Tensor &output,
+                          int64_t minlength);
+void gather_cuda(const torch::Tensor &x,
+                 const torch::Tensor &indices,
+                 const torch::Tensor &bins,
+                 torch::Tensor &output,
+                 int64_t E,
+                 int64_t C,
+                 int64_t top_k);
+void scatter_cuda(const torch::Tensor &src,
+                  const torch::Tensor &indices,
+                  const torch::Tensor &bins,
+                  const torch::Tensor &weights,
+                  torch::Tensor &y,
+                  int64_t T,
+                  int64_t E,
+                  int64_t C,
+                  int64_t top_k);
+torch::Tensor index_select_out_cuda(torch::Tensor out,
+                                    torch::Tensor in,
+                                    torch::Tensor idx_int32);
+
+// scatter gradients back to expert outputs and routing weights
+template <typename scalar_t>
+__global__ void binned_scatter_backward_kernel(
+    const scalar_t *__restrict__ grad_y,            // [T, H]
+    const int *__restrict__ indices,                // [S]
+    const int *__restrict__ bins,                   // [E+1]
+    const scalar_t *__restrict__ selected_weights,  // [S]
+    const scalar_t *__restrict__ expert_output,     // [E, C, H]
+    scalar_t *__restrict__ grad_expert_output,      // [E, C, H]
+    scalar_t *__restrict__ grad_selected_weights,   // [S]
+    int T,
+    int K,
+    int H,
+    int E,
+    int C) {
+
+  int e = blockIdx.x;
+  int i = blockIdx.y;
+  if (e >= E || i >= C)
+    return;
+
+  const int start = (e == 0) ? 0 : bins[e - 1];
+  const int end = bins[e];
+  const int n_all = end - start;
+  const int take = (n_all > 0) ? min(n_all, C) : 0;
+
+  if (take == 0 || i >= take) {
+    scalar_t *dst = grad_expert_output + ((size_t)e * C + i) * H;
+    for (int h = threadIdx.x; h < H; h += blockDim.x)
+      dst[h] = scalar_t(0);
+    return;
+  }
+
+  const int sorted_pos = start + i;
+  const int flat_pos = indices[sorted_pos];
+  const int tok = flat_pos / K;
+
+  const scalar_t scale = selected_weights[sorted_pos];
+
+  const scalar_t *grad_y_ptr = grad_y + (size_t)tok * H;
+  scalar_t *grad_exp_ptr = grad_expert_output + ((size_t)e * C + i) * H;
+  const scalar_t *expert_ptr = expert_output + ((size_t)e * C + i) * H;
+
+  for (int h = threadIdx.x; h < H; h += blockDim.x) {
+    grad_exp_ptr[h] += grad_y_ptr[h] * scale;
+  }
+
+  if (threadIdx.x == 0) {
+    scalar_t sum = scalar_t(0);
+    for (int h = 0; h < H; ++h)
+      sum += grad_y_ptr[h] * expert_ptr[h];
+    gpuAtomicAdd(&grad_selected_weights[flat_pos], sum);
+  }
+}
+
+// gather gradients back to hidden states
+template <typename scalar_t>
+__global__ void binned_gather_backward_kernel(
+    const scalar_t *__restrict__ grad_x,  // [E, C, H]
+    const int *__restrict__ indices,      // [S]
+    const int *__restrict__ bins,         // [E+1]
+    scalar_t *__restrict__ grad_hidden,   // [T, H]
+    int T,
+    int K,
+    int H,
+    int E,
+    int C) {
+
+  int e = blockIdx.x;
+  int i = blockIdx.y;
+  if (e >= E || i >= C)
+    return;
+
+  const int start = (e == 0) ? 0 : bins[e - 1];
+  const int end = bins[e];
+  const int n = min(max(end - start, 0), C);
+  if (i >= n)
+    return;
+
+  const int flat_pos = indices[start + i];
+  const int tok = flat_pos / K;
+
+  const scalar_t *gx = grad_x + ((size_t)e * C + i) * H;
+  scalar_t *gh = grad_hidden + (size_t)tok * H;
+
+  for (int h = threadIdx.x; h < H; h += blockDim.x) {
+    gpuAtomicAdd(&gh[h], gx[h]);
+  }
+}
+
+std::vector<torch::Tensor> experts_backward_cuda(
+    const torch::Tensor &grad_out,
+    const torch::Tensor &hidden_states,
+    const torch::Tensor &router_indices,
+    const torch::Tensor &routing_weights,
+    const torch::Tensor &gate_up_proj,
+    const torch::Tensor &gate_up_proj_bias,
+    const torch::Tensor &down_proj,
+    const torch::Tensor &down_proj_bias,
+    int64_t expert_capacity,
+    int64_t num_experts,
+    int64_t top_k) {
+  TORCH_CHECK(grad_out.is_cuda(), "grad_out must be CUDA");
+  TORCH_CHECK(hidden_states.is_cuda(), "hidden_states must be CUDA");
+  TORCH_CHECK(router_indices.is_cuda(), "router_indices must be CUDA");
+  TORCH_CHECK(routing_weights.is_cuda(), "routing_weights must be CUDA");
+  TORCH_CHECK(gate_up_proj.is_cuda() && down_proj.is_cuda(),
+              "weights must be CUDA");
+  TORCH_CHECK(gate_up_proj_bias.is_cuda() && down_proj_bias.is_cuda(),
+              "biases must be CUDA");
+
+  const at::cuda::OptionalCUDAGuard device_guard(grad_out.device());
+
+  const int64_t T = hidden_states.size(0);
+  const int64_t H = hidden_states.size(1);
+  const int64_t E = num_experts;
+  const int64_t C = expert_capacity;
+  const int64_t K = top_k;
+
+  TORCH_CHECK(router_indices.dim() == 2 && router_indices.size(0) == T &&
+                  router_indices.size(1) == K,
+              "router_indices must be [T, K]");
+
+  auto float_opts = hidden_states.options();
+  auto i32_opts = torch::TensorOptions()
+                      .device(hidden_states.device())
+                      .dtype(torch::kInt32);
+
+  // Sort tokens by expert ID
+  torch::Tensor flat_indices =
+      router_indices.contiguous().view({-1}).to(torch::kInt32);
+  torch::Tensor sorted_values = torch::empty_like(flat_indices);
+  torch::Tensor sorted_indices = torch::empty_like(flat_indices);
+  sort_cuda(flat_indices, 32, sorted_values, sorted_indices);
+
+  // Compute expert boundaries
+  torch::Tensor bins = torch::empty({E + 1}, i32_opts);
+  bincount_cumsum_cuda(sorted_values, bins, E);
+  cudaDeviceSynchronize();
+
+  // Gather tokens for each expert
+  torch::Tensor x = torch::empty({E, C, H}, float_opts);
+  gather_cuda(hidden_states.contiguous(), sorted_indices, bins, x, E, C, K);
+
+  // Gate-up projection
+  torch::Tensor gate_up = at::bmm(x.contiguous(), gate_up_proj.contiguous());
+  gate_up.add_(gate_up_proj_bias.unsqueeze(1));
+
+  // GLU activation (recompute forward)
+  auto gu_pair = gate_up.view({E, C, H, 2});
+  torch::Tensor pre_gate = gu_pair.select(3, 0);
+  torch::Tensor pre_up = gu_pair.select(3, 1);
+
+  const double limit = 7.0;
+  const double alpha = 1.702;
+  torch::Tensor gate_clamped = at::clamp_max(pre_gate, limit);
+  torch::Tensor up_clamped = at::clamp(pre_up, -limit, limit);
+  torch::Tensor s = at::sigmoid(gate_clamped * alpha);
+  torch::Tensor gate_act = gate_clamped * s;
+  torch::Tensor up_out = (1 + up_clamped) * gate_act;
+
+  // Down projection
+  torch::Tensor y_expert = at::bmm(up_out.contiguous(), down_proj.contiguous());
+  y_expert.add_(down_proj_bias.unsqueeze(1));
+
+  // Get routing weights in sorted order
+  torch::Tensor flat_router = router_indices.view({T, K});
+  torch::Tensor selected_2d;
+  if (routing_weights.size(1) == K) {
+    selected_2d = routing_weights.contiguous();
+  } else {
+    TORCH_CHECK(routing_weights.size(1) == E,
+                "routing_weights must be [T,K] or [T,E]");
+    selected_2d = at::gather(routing_weights, 1, flat_router.to(torch::kLong));
+  }
+  torch::Tensor selected_flat = selected_2d.contiguous().view({T * K});
+  torch::Tensor weights_sorted = torch::empty_like(selected_flat);
+  index_select_out_cuda(weights_sorted, selected_flat, sorted_indices);
+
+  // Initialize gradients
+  torch::Tensor dHidden = torch::zeros_like(hidden_states);
+  torch::Tensor dRouting;
+  torch::Tensor dWgu = torch::zeros_like(gate_up_proj);
+  torch::Tensor dbgu = torch::zeros_like(gate_up_proj_bias);
+  torch::Tensor dWd = torch::zeros_like(down_proj);
+  torch::Tensor dbd = torch::zeros_like(down_proj_bias);
+
+  // Reshape grad_out to [T,H]
+  TORCH_CHECK(grad_out.numel() == T * H || grad_out.numel() == T * K * H,
+              "grad_out numel must be T*H or T*K*H");
+  torch::Tensor grad_y = (grad_out.numel() == T * H)
+                             ? grad_out.contiguous().view({T, H})
+                             : grad_out.contiguous().view({T, K, H}).sum(1);
+
+  // Backward through scatter
+  torch::Tensor grad_expert_output = torch::zeros({E, C, H}, float_opts);
+  torch::Tensor grad_selected_weights = torch::zeros({T * K}, float_opts);
+  {
+    dim3 grid(E, C);
+    int threads = 256;
+    AT_DISPATCH_FLOATING_TYPES_AND2(
+        at::kHalf,
+        at::kBFloat16,
+        hidden_states.scalar_type(),
+        "binned_scatter_backward",
+        [&] {
+          using st = scalar_t;
+          binned_scatter_backward_kernel<st>
+              <<<grid, threads>>>(grad_y.data_ptr<st>(),
+                                  sorted_indices.data_ptr<int>(),
+                                  bins.data_ptr<int>(),
+                                  weights_sorted.data_ptr<st>(),
+                                  y_expert.data_ptr<st>(),
+                                  grad_expert_output.data_ptr<st>(),
+                                  grad_selected_weights.data_ptr<st>(),
+                                  (int)T,
+                                  (int)K,
+                                  (int)H,
+                                  (int)E,
+                                  (int)C);
+        });
+    cudaDeviceSynchronize();
+  }
+
+  // Route weight gradients
+  torch::Tensor grad_selected_flat = torch::zeros({T * K}, float_opts);
+  grad_selected_flat.index_add_(0,
+                                sorted_indices.to(torch::kLong),
+                                grad_selected_weights);
+
+  if (routing_weights.size(1) == E) {
+    torch::Tensor flat_grad_routing = torch::zeros_like(routing_weights);
+    flat_grad_routing.scatter_add_(1,
+                                   flat_router.to(torch::kLong),
+                                   grad_selected_flat.view({T, K}));
+    dRouting = flat_grad_routing;
+  } else {
+    dRouting = grad_selected_flat.view({T, K});
+  }
+
+  // Backward through down projection
+  dbd = grad_expert_output.sum(1);
+  torch::Tensor grad_intermediate =
+      torch::bmm(grad_expert_output.contiguous(),
+                 down_proj.transpose(1, 2).contiguous());
+  dWd = torch::bmm(up_out.transpose(1, 2).contiguous(),
+                   grad_expert_output.contiguous());
+
+  // Backward through GLU
+  torch::Tensor grad_up_plus_1 = grad_intermediate * gate_act;
+  torch::Tensor grad_glu = grad_intermediate * (up_clamped + 1);
+  torch::Tensor grad_up_clamped = grad_up_plus_1;
+
+  torch::Tensor sigmoid_gate = torch::sigmoid(gate_clamped * alpha);
+  torch::Tensor grad_gate_clamped =
+      grad_glu *
+      (sigmoid_gate + gate_clamped * sigmoid_gate * (1 - sigmoid_gate) * alpha);
+
+  // Unclamp gradients
+  torch::Tensor grad_gate = grad_gate_clamped.clone();
+  grad_gate.masked_fill_(pre_gate > limit, 0);
+  torch::Tensor grad_up = grad_up_clamped.clone();
+  grad_up.masked_fill_(pre_up > limit, 0);
+  grad_up.masked_fill_(pre_up < -limit, 0);
+
+  // Merge gate/up gradients
+  torch::Tensor grad_gate_up_pair = torch::zeros({E, C, H, 2}, float_opts);
+  grad_gate_up_pair.select(3, 0).copy_(grad_gate);
+  grad_gate_up_pair.select(3, 1).copy_(grad_up);
+  torch::Tensor grad_gate_up = grad_gate_up_pair.view({E, C, 2 * H});
+
+  // Backward through gate-up projection
+  dbgu = grad_gate_up.sum(1);
+  torch::Tensor grad_x = torch::bmm(grad_gate_up.contiguous(),
+                                    gate_up_proj.transpose(1, 2).contiguous());
+  dWgu = torch::bmm(x.transpose(1, 2).contiguous(), grad_gate_up.contiguous());
+
+  // Backward through gather
+  torch::Tensor grad_hidden = torch::zeros({T, H}, float_opts);
+  {
+    dim3 grid(E, C);
+    int threads = 256;
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::kHalf,
+                                    at::kBFloat16,
+                                    hidden_states.scalar_type(),
+                                    "binned_gather_backward",
+                                    [&] {
+                                      using st = scalar_t;
+                                      binned_gather_backward_kernel<st>
+                                          <<<grid, threads>>>(
+                                              grad_x.data_ptr<st>(),
+                                              sorted_indices.data_ptr<int>(),
+                                              bins.data_ptr<int>(),
+                                              grad_hidden.data_ptr<st>(),
+                                              (int)T,
+                                              (int)K,
+                                              (int)H,
+                                              (int)E,
+                                              (int)C);
+                                    });
+    cudaDeviceSynchronize();
+  }
+  dHidden += grad_hidden;
+
+  return {dHidden, dRouting, dWgu, dbgu, dWd, dbd};
+}

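The new op recomputes the forward activations and returns the gradients in the order {dHidden, dRouting, dWgu, dbgu, dWd, dbd}. A minimal sketch of how it could be wired into autograd (not part of this commit; the wrapper class, the loading path, and the hypothetical ExpertsFn name are assumptions):

# Hypothetical autograd glue around the ops registered by this commit.
import torch
import yamoe  # or: from kernels import get_kernel; yamoe = get_kernel("drbh/yamoe")


class ExpertsFn(torch.autograd.Function):
    @staticmethod
    def forward(ctx, hidden_states, router_indices, routing_weights,
                gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias,
                expert_capacity, num_experts, top_k):
        ctx.save_for_backward(hidden_states, router_indices, routing_weights,
                              gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)
        ctx.dims = (expert_capacity, num_experts, top_k)
        return yamoe.experts(hidden_states, router_indices, routing_weights,
                             gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias,
                             expert_capacity, num_experts, top_k)

    @staticmethod
    def backward(ctx, grad_out):
        saved = ctx.saved_tensors
        expert_capacity, num_experts, top_k = ctx.dims
        # Order follows experts_backward_cuda: dHidden, dRouting, dWgu, dbgu, dWd, dbd
        d_hidden, d_routing, d_wgu, d_bgu, d_wd, d_bd = yamoe.experts_backward(
            grad_out.contiguous(), *saved, expert_capacity, num_experts, top_k
        )
        # One gradient per forward input; router_indices and the int args get None.
        return d_hidden, None, d_routing, d_wgu, d_bgu, d_wd, d_bd, None, None, None
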
readme_example.py CHANGED
@@ -7,8 +7,7 @@
 
 import time
 import torch
-from kernels import get_local_kernel
-from kernels import get_kernel
+from kernels import get_kernel, get_local_kernel
 from pathlib import Path
 from torch.nn import functional as F
 
@@ -83,6 +82,8 @@ torch.cuda.synchronize()
 elapsed_ms = (time.perf_counter() - start) * 1e3
 peak_mem_mb = torch.cuda.max_memory_allocated() / (1024 * 1024)
 
-print(f"Output: sum={output.sum().item():.1f}, min={output.min().item():.1f}, max={output.max().item():.1f}")
+print(
+    f"Output: sum={output.sum().item():.1f}, min={output.min().item():.1f}, max={output.max().item():.1f}"
+)
 print(f"First 3: {output.view(-1)[:3].tolist()}")
 print(f"Time: {elapsed_ms:.1f}ms, Memory: {peak_mem_mb:.0f}MB")

torch-ext/torch_binding.cpp CHANGED
@@ -67,6 +67,20 @@ TORCH_LIBRARY_EXPAND(
           "int num_experts, "
           "int top_k) -> Tensor");
   ops.impl("experts", torch::kCUDA, &experts_cuda);
+
+  ops.def("experts_backward("
+          "Tensor grad_out, "
+          "Tensor hidden_states, "
+          "Tensor router_indices, "
+          "Tensor routing_weights, "
+          "Tensor gate_up_proj, "
+          "Tensor gate_up_proj_bias, "
+          "Tensor down_proj, "
+          "Tensor down_proj_bias, "
+          "int expert_capacity, "
+          "int num_experts, "
+          "int top_k) -> Tensor[]");
+  ops.impl("experts_backward", torch::kCUDA, &experts_backward_cuda);
 }
 
 REGISTER_EXTENSION(

torch-ext/torch_binding.h CHANGED
@@ -53,3 +53,19 @@ torch::Tensor experts_cuda(
     int64_t num_experts,                 // E - number of experts
     int64_t top_k                        // K - top-k routing
 );
+
+std::vector<torch::Tensor> experts_backward_cuda(
+    const torch::Tensor &grad_out,       // [T, H] - gradient from output
+    const torch::Tensor &hidden_states,  // [T, H] - original input
+    const torch::Tensor &router_indices, // [T, K] - expert indices per token
+    const torch::Tensor &routing_weights,// [T, K] or [T, E] - routing weights
+    const torch::Tensor
+        &gate_up_proj,                   // [E, H, 2*H] - gate/up projection weights
+    const torch::Tensor
+        &gate_up_proj_bias,              // [E, 2*H] - gate/up projection bias
+    const torch::Tensor &down_proj,      // [E, H, H] - down projection weights
+    const torch::Tensor &down_proj_bias, // [E, H] - down projection bias
+    int64_t expert_capacity,             // C - capacity per expert
+    int64_t num_experts,                 // E - number of experts
+    int64_t top_k                        // K - top-k routing
+);

torch-ext/yamoe/__init__.py CHANGED
@@ -1,5 +1,7 @@
 from ._ops import ops
-from . import reference
+from .layers import Yamoe
+from .vendored import yamoe_ref
+from .vendored import gpt_oss_mlp
 
 gather = ops.gather
 scatter = ops.scatter
@@ -7,8 +9,14 @@ sort = ops.sort
 bincount_cumsum = ops.bincount_cumsum
 batch_mm = ops.batch_mm
 experts = ops.experts
+experts_backward = ops.experts_backward
 
 __all__ = [
+    # Debug
+    "ops",
+    # Layer (nn module)
+    "Yamoe",
+    # Functions
     "shuffle",
     "gather",
     "scatter",
@@ -16,6 +24,9 @@ __all__ = [
     "bincount_cumsum",
     "batch_mm",
     "experts",
-    # Export the reference implementation
+    "experts_backward",
+    # Vendored reference implementations
    "reference",
+    "yamoe_ref",
+    "gpt_oss_mlp",
 ]

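A rough sketch of calling the newly exported ops directly. The argument order follows the schema in torch_binding.cpp and the shape comments in torch_binding.h; the concrete sizes (T=8, H=16, E=4, K=2) and the random weights are illustrative only:

import torch
import yamoe

T, H, E, K = 8, 16, 4, 2
C = T * K // E * 2  # expert capacity, same heuristic as compare_example.py

hidden = torch.randn(T, H, device="cuda")
router_indices = torch.randint(0, E, (T, K), device="cuda")
routing_weights = torch.rand(T, E, device="cuda")          # dense [T, E] form
gate_up_proj = torch.randn(E, H, 2 * H, device="cuda") * 0.02
gate_up_proj_bias = torch.zeros(E, 2 * H, device="cuda")
down_proj = torch.randn(E, H, H, device="cuda") * 0.02
down_proj_bias = torch.zeros(E, H, device="cuda")

out = yamoe.experts(
    hidden, router_indices, routing_weights,
    gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias,
    C, E, K,
)  # per-token output, [T, H]

# Gradients come back in the order produced by experts_backward_cuda:
# dHidden, dRouting, dWgu, dbgu, dWd, dbd
d_hidden, d_routing, d_wgu, d_bgu, d_wd, d_bd = yamoe.experts_backward(
    torch.ones_like(out), hidden, router_indices, routing_weights,
    gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias,
    C, E, K,
)
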
torch-ext/yamoe/layers.py ADDED
@@ -0,0 +1,104 @@
+import torch
+from ._ops import ops
+
+
+class Yamoe(torch.nn.Module):
+    """Yamoe MoE layer with routing and expert computation"""
+
+    can_torch_compile: bool = True
+
+    def __init__(self):
+        super().__init__()
+        # Pre-allocate buffers to avoid repeated allocations
+        self._routing_weights_buffer = None
+        self._batch_indices_buffer = None
+        self._last_batch_seq = None
+        self._last_num_experts = None
+
+    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size, seq_len, hidden_dim = hidden_states.shape
+        batch_seq = batch_size * seq_len
+
+        num_experts = getattr(self, "num_experts", 128)
+        top_k = getattr(self, "top_k", 4)
+
+        # Route tokens to experts
+        x_flat = hidden_states.view(-1, hidden_dim)
+        logits = torch.nn.functional.linear(
+            x_flat, self.router.weight, self.router.bias
+        )
+
+        # Compute top-k
+        if top_k == 1:
+            routing_weights, router_indices = logits.max(dim=-1, keepdim=True)
+        else:
+            routing_weights, router_indices = torch.topk(logits, top_k, dim=-1)
+
+        routing_weights = routing_weights.softmax(dim=-1)
+
+        # Create router scores
+        router_scores = (
+            torch.zeros_like(logits)
+            .scatter_(1, router_indices, routing_weights)
+            .transpose(0, 1)
+        )
+
+        # Convert routing_weights to sparse format [batch_seq, num_experts]
+        # Reuse buffer if possible to reduce allocations
+        if (
+            self._routing_weights_buffer is None
+            or self._last_batch_seq != batch_seq
+            or self._last_num_experts != num_experts
+            or self._routing_weights_buffer.device != routing_weights.device
+        ):
+            self._routing_weights_buffer = torch.zeros(
+                batch_seq,
+                num_experts,
+                device=routing_weights.device,
+                dtype=routing_weights.dtype,
+            )
+            self._batch_indices_buffer = (
+                torch.arange(batch_seq, device=routing_weights.device)
+                .unsqueeze(1)
+                .expand(-1, top_k)
+            )
+            self._last_batch_seq = batch_seq
+            self._last_num_experts = num_experts
+        else:
+            self._routing_weights_buffer.zero_()
+
+        # Fill sparse routing weights
+        flat_indices = router_indices.view(batch_seq, top_k)
+        flat_weights = routing_weights.view(batch_seq, top_k)
+        self._routing_weights_buffer[self._batch_indices_buffer, flat_indices] = (
+            flat_weights
+        )
+
+        # FIX: Use the correct expert projections
+        gate_up = self.experts.gate_up_proj[:, :, : hidden_dim * top_k].contiguous()
+        gate_up_bias = self.experts.gate_up_proj_bias[
+            :, : hidden_dim * top_k
+        ].contiguous()
+
+        down_proj = self.experts.down_proj[:, :hidden_dim, :].contiguous()
+
+        expert_capacity = batch_seq * top_k // num_experts * 2
+
+        with torch.no_grad():
+            # Compute expert output
+            output = ops.experts(
+                hidden_states.view(-1, hidden_dim),
+                router_indices,
+                self._routing_weights_buffer,
+                gate_up,
+                gate_up_bias,
+                down_proj,
+                self.experts.down_proj_bias,
+                expert_capacity,
+                num_experts,
+                top_k,
+            )
+
+        # Reshape output back to [B, S, H]
+        output = output.view(batch_size, seq_len, hidden_dim)
+        return output, router_scores

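A rough standalone sketch of exercising the Yamoe layer. In practice the kernels hub attaches this module to an existing MLP; here the `router`, `experts`, `num_experts`, and `top_k` attributes that forward reads via getattr are set by hand, and the config values are illustrative assumptions:

import torch
import yamoe
from yamoe.vendored.gpt_oss_mlp import GptOssExperts, GptOssTopKRouter


class Cfg:
    hidden_size = 64
    intermediate_size = 64       # expert_dim == hidden_size keeps the slicing in forward consistent
    num_local_experts = 8
    num_experts_per_tok = 2


cfg = Cfg()
layer = yamoe.Yamoe()
layer.router = GptOssTopKRouter(cfg).cuda()
layer.experts = GptOssExperts(cfg).cuda()
layer.num_experts = cfg.num_local_experts
layer.top_k = cfg.num_experts_per_tok

# Parameters are created with torch.empty in the vendored modules; give them values first.
for p in list(layer.router.parameters()) + list(layer.experts.parameters()):
    torch.nn.init.trunc_normal_(p, std=0.02)

x = torch.randn(2, 16, cfg.hidden_size, device="cuda")
out, router_scores = layer(x)  # out: [2, 16, hidden_size]
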
torch-ext/yamoe/{reference.py → vendored/gpt_oss_mlp.py} RENAMED
@@ -1,5 +1,14 @@
 import torch
-import torch.nn as nn
+from torch import nn
+from torch.nn import functional as F
+
+
+def info(tensor, name):
+    print(name)
+    print(tensor.shape)
+    print(tensor.cpu())
+    print()
+
 
 class GptOssExperts(nn.Module):
     def __init__(self, config):
@@ -8,16 +17,26 @@ class GptOssExperts(nn.Module):
         self.num_experts = config.num_local_experts
         self.hidden_size = config.hidden_size
         self.expert_dim = self.intermediate_size
-        self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
-        self.gate_up_proj_bias = nn.Parameter(torch.empty(self.num_experts, 2 * self.expert_dim))
-        self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size)))
-        self.down_proj_bias = nn.Parameter(torch.empty(self.num_experts, self.hidden_size))
+        self.gate_up_proj = nn.Parameter(
+            torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim)
+        )
+        self.gate_up_proj_bias = nn.Parameter(
+            torch.empty(self.num_experts, 2 * self.expert_dim)
+        )
+        self.down_proj = nn.Parameter(
+            torch.empty((self.num_experts, self.expert_dim, self.hidden_size))
+        )
+        self.down_proj_bias = nn.Parameter(
+            torch.empty(self.num_experts, self.hidden_size)
+        )
         self.alpha = 1.702
         self.limit = 7.0
 
-    def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
+    def forward(
+        self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None
+    ) -> torch.Tensor:
         """
-        When training is is more efficient to just loop over the experts and compute the output for each expert
+        When training it is more efficient to just loop over the experts and compute the output for each expert
         as otherwise the memory would explode.
 
         For inference we can sacrifice some memory and compute the output for all experts at once. By repeating the inputs.
@@ -29,45 +48,112 @@
         Returns:
             torch.Tensor
         """
-
-        # import ipdb; ipdb.set_trace()
-
         batch_size = hidden_states.shape[0]
-        hidden_states = hidden_states.reshape(-1, self.hidden_size)  # (num_tokens, hidden_size)
+        hidden_states = hidden_states.reshape(
+            -1, self.hidden_size
+        )  # (num_tokens, hidden_size)
         num_experts = routing_weights.shape[1]
-        if self.training:
-            next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+
+        if hidden_states.device.type == "cpu" or self.training:
+            next_states = torch.zeros_like(
+                hidden_states, dtype=hidden_states.dtype, device=hidden_states.device
+            )
             with torch.no_grad():
-                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
+                expert_mask = torch.nn.functional.one_hot(
+                    router_indices, num_classes=num_experts
+                )
                 expert_mask = expert_mask.permute(2, 1, 0)
-                # we sum on the top_k and on the sequence lenght to get which experts
+                # we sum on the top_k and on the sequence length to get which experts
                 # are hit this time around
-                expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-            for expert_idx in expert_hitted[:]:
+                expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+            for expert_idx in expert_hit[:]:
+                # expert_idx only have 1 element, so we can use scale for fast indexing
+                expert_idx = expert_idx[0]
                 with torch.no_grad():
-                    _, token_idx = torch.where(expert_mask[expert_idx[0]])
+                    _, token_idx = torch.where(expert_mask[expert_idx])
                 current_state = hidden_states[token_idx]
-                gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
+                gate_up = (
+                    current_state @ self.gate_up_proj[expert_idx]
+                    + self.gate_up_proj_bias[expert_idx]
+                )
                 gate, up = gate_up[..., ::2], gate_up[..., 1::2]
                 gate = gate.clamp(min=None, max=self.limit)
                 up = up.clamp(min=-self.limit, max=self.limit)
                 glu = gate * torch.sigmoid(gate * self.alpha)
                 gated_output = (up + 1) * glu
-                out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
-                weighted_output = out[0] * routing_weights[token_idx, expert_idx, None]
-                next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+                out = (
+                    gated_output @ self.down_proj[expert_idx]
+                    + self.down_proj_bias[expert_idx]
+                )
+                weighted_output = out * routing_weights[token_idx, expert_idx, None]
+                next_states.index_add_(
+                    0, token_idx, weighted_output.to(hidden_states.dtype)
+                )
             next_states = next_states.view(batch_size, -1, self.hidden_size)
         else:
            hidden_states = hidden_states.repeat(num_experts, 1)
            hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
-            gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
+            gate_up = (
+                torch.bmm(hidden_states, self.gate_up_proj)
+                + self.gate_up_proj_bias[..., None, :]
+            )
             gate, up = gate_up[..., ::2], gate_up[..., 1::2]
             gate = gate.clamp(min=None, max=self.limit)
             up = up.clamp(min=-self.limit, max=self.limit)
             glu = gate * torch.sigmoid(gate * self.alpha)
             next_states = torch.bmm(((up + 1) * glu), self.down_proj)
             next_states = next_states + self.down_proj_bias[..., None, :]
-            next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
-            next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
+            next_states = next_states.view(
+                num_experts, batch_size, -1, self.hidden_size
+            )
+            next_states = (
+                next_states
+                * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[
+                    ..., None
+                ]
+            )
             next_states = next_states.sum(dim=0)
         return next_states
+
+
+class GptOssTopKRouter(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.top_k = config.num_experts_per_tok
+        self.num_experts = config.num_local_experts
+        self.hidden_dim = config.hidden_size
+        self.weight = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim))
+        self.bias = nn.Parameter(torch.empty(self.num_experts))
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = F.linear(
+            hidden_states, self.weight, self.bias
+        )  # (seq_len, num_experts)
+        router_top_value, router_indices = torch.topk(
+            router_logits, self.top_k, dim=-1
+        )  # (seq_len, top_k)
+        router_top_value = torch.nn.functional.softmax(
+            router_top_value, dim=1, dtype=router_top_value.dtype
+        )
+        router_scores = torch.zeros_like(router_logits).scatter_(
+            1, router_indices, router_top_value
+        )
+        return router_scores, router_indices
+
+
+# @use_kernel_forward_from_hub("MegaBlocksMoeMLP")
+class GptOssMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.router = GptOssTopKRouter(config)
+        self.experts = GptOssExperts(config)
+
+    def forward(self, hidden_states):
+        router_scores, router_indices = self.router(
+            hidden_states
+        )  # (num_experts, seq_len)
+        routed_out = self.experts(
+            hidden_states, router_indices=router_indices, routing_weights=router_scores
+        )
+        return routed_out, router_scores

torch-ext/yamoe/vendored/yamoe_ref.py ADDED
@@ -0,0 +1,82 @@
+import torch
+
+
+def binned_gather(x, indices, bins, expert_capacity, top_k):
+    E, H = bins.shape[0], x.shape[1]
+    out = torch.zeros((E, expert_capacity, H), device=x.device, dtype=x.dtype)
+    for e in range(E):
+        start = 0 if e == 0 else bins[e - 1]
+        end = bins[e]
+        n = min(end - start, expert_capacity)
+        for i in range(n):
+            flat_pos = indices[start + i]
+            tok = flat_pos // top_k
+            out[e, i] = x[tok]
+    return out
+
+
+def binned_scatter(x, indices, weights, bins, expert_capacity, top_k):
+    E, C, H = x.shape
+    N = indices.shape[0] // top_k
+    out = torch.zeros((N, top_k, H), dtype=x.dtype, device=x.device)
+    for e in range(E):
+        start = 0 if e == 0 else bins[e - 1]
+        end = bins[e]
+        n = end - start
+        if n == 0:
+            continue
+        take = min(n, expert_capacity)
+        for i in range(take):
+            flat_pos = indices[start + i]  # flattened (token, slot)
+            tok = flat_pos // top_k
+            slot = flat_pos % top_k
+            scale = weights[flat_pos] if weights is not None else 1.0
+            out[tok, slot] = x[e, i] * scale
+    return out.sum(dim=1)
+
+
+def sort_tokens_by_expert(router_indices, num_experts):
+    flat_indices = router_indices.flatten()
+    sorted_values, sorted_indices = torch.sort(flat_indices)
+    tokens_per_expert = torch.bincount(sorted_values, minlength=num_experts)
+    bins = torch.cumsum(tokens_per_expert, dim=0)
+    return sorted_indices, sorted_values, bins, tokens_per_expert
+
+
+def binned_experts_ref(
+    hidden_states,
+    router_indices,
+    routing_weights,
+    gate_up_proj,
+    gate_up_proj_bias,
+    down_proj,
+    down_proj_bias,
+    expert_capacity,
+):
+    B, S, H = hidden_states.shape
+    E, K = routing_weights.shape[1], router_indices.shape[1]
+
+    indices, _, bins, _ = sort_tokens_by_expert(router_indices, E)
+    x = binned_gather(hidden_states.view(-1, H), indices, bins, expert_capacity, K)
+
+    gate_up = torch.bmm(x, gate_up_proj) + gate_up_proj_bias[..., None, :]
+    gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+
+    # clamp to limit
+    limit = 7.0
+    gate = gate.clamp(min=None, max=limit)
+    up = up.clamp(min=-limit, max=limit)
+
+    glu = gate * torch.sigmoid(gate * 1.702)
+    x = (up + 1) * glu
+    x = torch.bmm(x, down_proj) + down_proj_bias[..., None, :]
+
+    # build routing weights aligned to (token, slot)
+    flat_dense = routing_weights.view(-1, E)  # [B*S, E]
+    flat_router = router_indices.view(-1, K)  # [B*S, K]
+    selected = torch.gather(flat_dense, 1, flat_router).reshape(-1)  # [B*S*K]
+
+    # scatter back
+    y = binned_scatter(x, indices, selected, bins, expert_capacity, K)  # [B*S, H]
+
+    return y.view(B, S, H)
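
A tiny worked example of the binning layout used by this reference (values are illustrative; the import path assumes the vendored module is importable as a package):

import torch
from yamoe.vendored.yamoe_ref import sort_tokens_by_expert, binned_gather

router_indices = torch.tensor([[1], [0], [1], [1]])  # 4 tokens, top_k=1, 2 experts
idx, vals, bins, counts = sort_tokens_by_expert(router_indices, num_experts=2)
# counts == tensor([1, 3]); bins == tensor([1, 4]):
# expert 0 owns sorted positions [0, 1), expert 1 owns [1, 4).
x = torch.arange(4.0).unsqueeze(1)                   # [T, H] with H = 1
gathered = binned_gather(x, idx, bins, expert_capacity=3, top_k=1)  # [E, C, H]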