drbh committed
Commit 281d8ba (0 parents)

feat: yet another moe
.clang-format ADDED
@@ -0,0 +1,288 @@
1
+ ---
2
+ Language: Cpp
3
+ AccessModifierOffset: -2
4
+ AlignAfterOpenBracket: Align
5
+ AlignArrayOfStructures: None
6
+ AlignConsecutiveAssignments:
7
+ Enabled: false
8
+ AcrossEmptyLines: false
9
+ AcrossComments: false
10
+ AlignCompound: false
11
+ AlignFunctionDeclarations: false
12
+ AlignFunctionPointers: false
13
+ PadOperators: true
14
+ AlignConsecutiveBitFields:
15
+ Enabled: false
16
+ AcrossEmptyLines: false
17
+ AcrossComments: false
18
+ AlignCompound: false
19
+ AlignFunctionDeclarations: false
20
+ AlignFunctionPointers: false
21
+ PadOperators: false
22
+ AlignConsecutiveDeclarations:
23
+ Enabled: false
24
+ AcrossEmptyLines: false
25
+ AcrossComments: false
26
+ AlignCompound: false
27
+ AlignFunctionDeclarations: true
28
+ AlignFunctionPointers: false
29
+ PadOperators: false
30
+ AlignConsecutiveMacros:
31
+ Enabled: false
32
+ AcrossEmptyLines: false
33
+ AcrossComments: false
34
+ AlignCompound: false
35
+ AlignFunctionDeclarations: false
36
+ AlignFunctionPointers: false
37
+ PadOperators: false
38
+ AlignConsecutiveShortCaseStatements:
39
+ Enabled: false
40
+ AcrossEmptyLines: false
41
+ AcrossComments: false
42
+ AlignCaseArrows: false
43
+ AlignCaseColons: false
44
+ AlignConsecutiveTableGenBreakingDAGArgColons:
45
+ Enabled: false
46
+ AcrossEmptyLines: false
47
+ AcrossComments: false
48
+ AlignCompound: false
49
+ AlignFunctionDeclarations: false
50
+ AlignFunctionPointers: false
51
+ PadOperators: false
52
+ AlignConsecutiveTableGenCondOperatorColons:
53
+ Enabled: false
54
+ AcrossEmptyLines: false
55
+ AcrossComments: false
56
+ AlignCompound: false
57
+ AlignFunctionDeclarations: false
58
+ AlignFunctionPointers: false
59
+ PadOperators: false
60
+ AlignConsecutiveTableGenDefinitionColons:
61
+ Enabled: false
62
+ AcrossEmptyLines: false
63
+ AcrossComments: false
64
+ AlignCompound: false
65
+ AlignFunctionDeclarations: false
66
+ AlignFunctionPointers: false
67
+ PadOperators: false
68
+ AlignEscapedNewlines: Right
69
+ AlignOperands: Align
70
+ AlignTrailingComments:
71
+ Kind: Always
72
+ OverEmptyLines: 0
73
+ AllowAllArgumentsOnNextLine: false
74
+ AllowAllParametersOfDeclarationOnNextLine: false
75
+ AllowBreakBeforeNoexceptSpecifier: Never
76
+ AllowShortBlocksOnASingleLine: Never
77
+ AllowShortCaseExpressionOnASingleLine: true
78
+ AllowShortCaseLabelsOnASingleLine: false
79
+ AllowShortCompoundRequirementOnASingleLine: true
80
+ AllowShortEnumsOnASingleLine: true
81
+ AllowShortFunctionsOnASingleLine: All
82
+ AllowShortIfStatementsOnASingleLine: Never
83
+ AllowShortLambdasOnASingleLine: All
84
+ AllowShortLoopsOnASingleLine: false
85
+ AllowShortNamespacesOnASingleLine: false
86
+ AlwaysBreakAfterDefinitionReturnType: None
87
+ AlwaysBreakBeforeMultilineStrings: false
88
+ AttributeMacros:
89
+ - __capability
90
+ BinPackArguments: false
91
+ BinPackParameters: false
92
+ BitFieldColonSpacing: Both
93
+ BraceWrapping:
94
+ AfterCaseLabel: false
95
+ AfterClass: false
96
+ AfterControlStatement: Never
97
+ AfterEnum: false
98
+ AfterExternBlock: false
99
+ AfterFunction: false
100
+ AfterNamespace: false
101
+ AfterObjCDeclaration: false
102
+ AfterStruct: false
103
+ AfterUnion: false
104
+ BeforeCatch: false
105
+ BeforeElse: false
106
+ BeforeLambdaBody: false
107
+ BeforeWhile: false
108
+ IndentBraces: false
109
+ SplitEmptyFunction: true
110
+ SplitEmptyRecord: true
111
+ SplitEmptyNamespace: true
112
+ BreakAdjacentStringLiterals: true
113
+ BreakAfterAttributes: Leave
114
+ BreakAfterJavaFieldAnnotations: false
115
+ BreakAfterReturnType: None
116
+ BreakArrays: true
117
+ BreakBeforeBinaryOperators: None
118
+ BreakBeforeConceptDeclarations: Always
119
+ BreakBeforeBraces: Attach
120
+ BreakBeforeInlineASMColon: OnlyMultiline
121
+ BreakBeforeTernaryOperators: true
122
+ BreakBinaryOperations: Never
123
+ BreakConstructorInitializers: AfterColon
124
+ BreakFunctionDefinitionParameters: true
125
+ BreakInheritanceList: BeforeColon
126
+ BreakStringLiterals: true
127
+ BreakTemplateDeclarations: MultiLine
128
+ ColumnLimit: 80
129
+ CommentPragmas: '^ IWYU pragma:'
130
+ CompactNamespaces: false
131
+ ConstructorInitializerIndentWidth: 4
132
+ ContinuationIndentWidth: 4
133
+ Cpp11BracedListStyle: true
134
+ DerivePointerAlignment: false
135
+ DisableFormat: false
136
+ EmptyLineAfterAccessModifier: Never
137
+ EmptyLineBeforeAccessModifier: LogicalBlock
138
+ ExperimentalAutoDetectBinPacking: false
139
+ FixNamespaceComments: true
140
+ ForEachMacros:
141
+ - foreach
142
+ - Q_FOREACH
143
+ - BOOST_FOREACH
144
+ IfMacros:
145
+ - KJ_IF_MAYBE
146
+ IncludeBlocks: Preserve
147
+ IncludeCategories:
148
+ - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
149
+ Priority: 2
150
+ SortPriority: 0
151
+ CaseSensitive: false
152
+ - Regex: '^(<|"(gtest|gmock|isl|json)/)'
153
+ Priority: 3
154
+ SortPriority: 0
155
+ CaseSensitive: false
156
+ - Regex: '.*'
157
+ Priority: 1
158
+ SortPriority: 0
159
+ CaseSensitive: false
160
+ IncludeIsMainRegex: '(Test)?$'
161
+ IncludeIsMainSourceRegex: ''
162
+ IndentAccessModifiers: false
163
+ IndentCaseBlocks: false
164
+ IndentCaseLabels: false
165
+ IndentExportBlock: true
166
+ IndentExternBlock: AfterExternBlock
167
+ IndentGotoLabels: true
168
+ IndentPPDirectives: None
169
+ IndentRequiresClause: true
170
+ IndentWidth: 2
171
+ IndentWrappedFunctionNames: false
172
+ InsertBraces: false
173
+ InsertNewlineAtEOF: false
174
+ InsertTrailingCommas: None
175
+ IntegerLiteralSeparator:
176
+ Binary: 0
177
+ BinaryMinDigits: 0
178
+ Decimal: 0
179
+ DecimalMinDigits: 0
180
+ Hex: 0
181
+ HexMinDigits: 0
182
+ JavaScriptQuotes: Leave
183
+ JavaScriptWrapImports: true
184
+ KeepEmptyLines:
185
+ AtEndOfFile: false
186
+ AtStartOfBlock: true
187
+ AtStartOfFile: true
188
+ KeepFormFeed: false
189
+ LambdaBodyIndentation: Signature
190
+ LineEnding: DeriveLF
191
+ MacroBlockBegin: ''
192
+ MacroBlockEnd: ''
193
+ MainIncludeChar: Quote
194
+ MaxEmptyLinesToKeep: 1
195
+ NamespaceIndentation: None
196
+ ObjCBinPackProtocolList: Auto
197
+ ObjCBlockIndentWidth: 2
198
+ ObjCBreakBeforeNestedBlockParam: true
199
+ ObjCSpaceAfterProperty: false
200
+ ObjCSpaceBeforeProtocolList: true
201
+ PackConstructorInitializers: BinPack
202
+ PenaltyBreakAssignment: 2
203
+ PenaltyBreakBeforeFirstCallParameter: 0
204
+ PenaltyBreakBeforeMemberAccess: 150
205
+ PenaltyBreakComment: 300
206
+ PenaltyBreakFirstLessLess: 120
207
+ PenaltyBreakOpenParenthesis: 0
208
+ PenaltyBreakScopeResolution: 500
209
+ PenaltyBreakString: 1000
210
+ PenaltyBreakTemplateDeclaration: 10
211
+ PenaltyExcessCharacter: 1000000
212
+ PenaltyIndentedWhitespace: 0
213
+ PenaltyReturnTypeOnItsOwnLine: 60
214
+ PointerAlignment: Right
215
+ PPIndentWidth: -1
216
+ QualifierAlignment: Leave
217
+ ReferenceAlignment: Pointer
218
+ ReflowComments: Always
219
+ RemoveBracesLLVM: false
220
+ RemoveEmptyLinesInUnwrappedLines: false
221
+ RemoveParentheses: Leave
222
+ RemoveSemicolon: false
223
+ RequiresClausePosition: OwnLine
224
+ RequiresExpressionIndentation: OuterScope
225
+ SeparateDefinitionBlocks: Leave
226
+ ShortNamespaceLines: 1
227
+ SkipMacroDefinitionBody: false
228
+ SortIncludes: CaseSensitive
229
+ SortJavaStaticImport: Before
230
+ SortUsingDeclarations: LexicographicNumeric
231
+ SpaceAfterCStyleCast: false
232
+ SpaceAfterLogicalNot: false
233
+ SpaceAfterTemplateKeyword: true
234
+ SpaceAroundPointerQualifiers: Default
235
+ SpaceBeforeAssignmentOperators: true
236
+ SpaceBeforeCaseColon: false
237
+ SpaceBeforeCpp11BracedList: false
238
+ SpaceBeforeCtorInitializerColon: true
239
+ SpaceBeforeInheritanceColon: true
240
+ SpaceBeforeJsonColon: false
241
+ SpaceBeforeParens: ControlStatements
242
+ SpaceBeforeParensOptions:
243
+ AfterControlStatements: true
244
+ AfterForeachMacros: true
245
+ AfterFunctionDefinitionName: false
246
+ AfterFunctionDeclarationName: false
247
+ AfterIfMacros: true
248
+ AfterOverloadedOperator: false
249
+ AfterPlacementOperator: true
250
+ AfterRequiresInClause: false
251
+ AfterRequiresInExpression: false
252
+ BeforeNonEmptyParentheses: false
253
+ SpaceBeforeRangeBasedForLoopColon: true
254
+ SpaceBeforeSquareBrackets: false
255
+ SpaceInEmptyBlock: false
256
+ SpacesBeforeTrailingComments: 1
257
+ SpacesInAngles: Never
258
+ SpacesInContainerLiterals: true
259
+ SpacesInLineCommentPrefix:
260
+ Minimum: 1
261
+ Maximum: -1
262
+ SpacesInParens: Never
263
+ SpacesInParensOptions:
264
+ ExceptDoubleParentheses: false
265
+ InCStyleCasts: false
266
+ InConditionalStatements: false
267
+ InEmptyParentheses: false
268
+ Other: false
269
+ SpacesInSquareBrackets: false
270
+ Standard: Latest
271
+ StatementAttributeLikeMacros:
272
+ - Q_EMIT
273
+ StatementMacros:
274
+ - Q_UNUSED
275
+ - QT_REQUIRE_VERSION
276
+ TableGenBreakInsideDAGArg: DontBreak
277
+ TabWidth: 8
278
+ UseTab: Never
279
+ VerilogBreakBetweenInstancePorts: true
280
+ WhitespaceSensitiveMacros:
281
+ - BOOST_PP_STRINGIZE
282
+ - CF_SWIFT_NAME
283
+ - NS_SWIFT_NAME
284
+ - PP_STRINGIZE
285
+ - STRINGIZE
286
+ WrapNamespaceBodyWithEmptyLines: Leave
287
+ ...
288
+
.gitattributes ADDED
@@ -0,0 +1,2 @@
1
+ *.so filter=lfs diff=lfs merge=lfs -text
2
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ .bak
2
+ .ruff_cache
3
+ .venv
4
+ cmake
5
+ result
6
+ scripts
7
+ __pycache__
8
+ CMakeLists.txt
9
+ setup.py
10
+ pyproject.toml
11
+ tests
12
+ torch-ext/registration.h
13
+ torch-ext/yamoe/_ops.py
14
+ csrc/batch_mm.cu
15
+ torch-ext/yamoe/*.abi3.so
.pre-commit-config.yaml ADDED
@@ -0,0 +1,7 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/mirrors-clang-format
3
+ rev: v20.1.8
4
+ hooks:
5
+ - id: clang-format
6
+ files: ^(csrc/|torch-ext/).*\.(?:c|cc|cpp|cxx|h|hh|hpp|hxx|cu|cuh)$
7
+ args: [-i]
README.md ADDED
@@ -0,0 +1,117 @@
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - kernel
5
+ ---
6
+
7
+ ```
8
+
9
+ oooo ooo .oooo. ooo. .oo. .oo. .ooooo. .ooooo.
10
+ `88. .8' `P )88b `888P"Y88bP"Y88b d88' `88b d88' `88b
11
+ `88..8' .oP"888 888 888 888 888 888 888ooo888
12
+ `888' d8( 888 888 888 888 888 888 888 .o
13
+ .8' `Y888""8o o888o o888o o888o `Y8bod8P' `Y8bod8P'
14
+ .o..P'
15
+ `Y8P'
16
+
17
+ Yet Another Mixture of Experts
18
+ ```
19
+
20
+ `yamoe` is a no-nonsense, straightforward implementation of Mixture of Experts (MoE) kernels, designed to be easy to use and computationally efficient.
21
+
22
+ ### Design goals
23
+ - simplicity: easy to read and understand the code
24
+ - efficiency: optimized for high throughput and low latency
25
+ - low memory usage: optimized to handle large batch sizes
26
+ - reproducibility: easy to reproduce results, no special new `sm` requirements
27
+
28
+
29
+ ### How to use
30
+
31
+ ```python
32
+ # /// script
33
+ # requires-python = "==3.10"
34
+ # dependencies = ["torch==2.7.0", "triton", "numpy", "kernels"]
35
+ # [tool.uv.sources]
36
+ # kernels = { git = "https://github.com/huggingface/kernels.git" }
37
+ # ///
38
+
39
+ import time
40
+ import torch
41
+ from kernels import get_kernel
42
+ from pathlib import Path
43
+ from torch.nn import functional as F
44
+
45
+ yamoe = get_kernel("drbh/yamoe")
46
+
47
+ # Configuration
48
+ torch.manual_seed(0)
49
+ batch_size, seq_len, hidden_dim = 128, 2048, 2880
50
+ num_experts, top_k = 32, 4
51
+
52
+ # Create routing weights
53
+ logits = torch.randn(batch_size, seq_len, num_experts)
54
+ probs = F.softmax(logits, dim=-1)
55
+ weights, indices = torch.topk(probs, top_k, dim=-1)
56
+
57
+ batch_seq = batch_size * seq_len
58
+ routing_weights = torch.zeros(batch_seq, num_experts, dtype=weights.dtype)
59
+ flat_indices, flat_weights = indices.reshape(-1, top_k), weights.reshape(-1, top_k)
60
+ batch_indices = torch.arange(batch_seq).unsqueeze(1).expand(-1, top_k)
61
+ routing_weights[batch_indices, flat_indices] = flat_weights
62
+
63
+ # Create model tensors (scaled to prevent overflow)
64
+ hidden_states = torch.randn(batch_size, seq_len, hidden_dim).cuda().half() * 0.1
65
+ gate_up_proj = torch.randn(num_experts, hidden_dim, 2 * hidden_dim).cuda().half() * 0.02
66
+ gate_up_proj_bias = torch.zeros(num_experts, 2 * hidden_dim).cuda().half()
67
+ down_proj = torch.randn(num_experts, hidden_dim, hidden_dim).cuda().half() * 0.02
68
+ down_proj_bias = torch.zeros(num_experts, hidden_dim).cuda().half()
69
+ routing_weights = routing_weights.cuda().half()
70
+ router_indices = flat_indices.cuda()
71
+
72
+ # Warmup
73
+ for _ in range(5):
74
+ _ = yamoe.experts(
75
+ hidden_states.view(-1, hidden_dim),
76
+ router_indices,
77
+ routing_weights.view(-1, num_experts),
78
+ gate_up_proj,
79
+ gate_up_proj_bias,
80
+ down_proj,
81
+ down_proj_bias,
82
+ seq_len,
83
+ num_experts,
84
+ top_k,
85
+ )
86
+
87
+ # Benchmark
88
+ torch.cuda.synchronize()
89
+ torch.cuda.reset_peak_memory_stats()
90
+ start = time.perf_counter()
91
+
92
+ with torch.no_grad():
93
+ output = yamoe.experts(
94
+ hidden_states.view(-1, hidden_dim),
95
+ router_indices,
96
+ routing_weights.view(-1, num_experts),
97
+ gate_up_proj,
98
+ gate_up_proj_bias,
99
+ down_proj,
100
+ down_proj_bias,
101
+ seq_len,
102
+ num_experts,
103
+ top_k,
104
+ )
105
+
106
+ torch.cuda.synchronize()
107
+ elapsed_ms = (time.perf_counter() - start) * 1e3
108
+ peak_mem_mb = torch.cuda.max_memory_allocated() / (1024 * 1024)
109
+
110
+ print(f"Output sum: {output.sum().item():.4f}")
111
+ print(f"Kernel time: {elapsed_ms:.3f} ms")
112
+ print(f"Peak GPU memory: {peak_mem_mb:.2f} MB")
113
+ # Output sum: 124.2500
114
+ # Kernel time: 85.722 ms
115
+ # Peak GPU memory: 8403.40 MB
116
+
117
+ ```
build.toml ADDED
@@ -0,0 +1,36 @@
1
+ [general]
2
+ name = "yamoe"
3
+ universal = false
4
+
5
+ [torch]
6
+ src = [
7
+ "torch-ext/torch_binding.cpp",
8
+ "torch-ext/torch_binding.h"
9
+ ]
10
+
11
+ [kernel.yamoe]
12
+ backend = "cuda"
13
+ cuda-capabilities = [
14
+ "7.0",
15
+ "7.2",
16
+ "7.5",
17
+ "8.0",
18
+ "8.6",
19
+ "8.7",
20
+ "8.9",
21
+ "9.0",
22
+ "10.0",
23
+ "10.1",
24
+ "11.8",
25
+ "12.0"
26
+ ]
27
+ depends = ["torch", "cutlass_3_8"]
28
+ src = [
29
+ "csrc/index_select.cu",
30
+ "csrc/gather.cu",
31
+ "csrc/scatter.cu",
32
+ "csrc/sort.cu",
33
+ "csrc/bincount_cumsum.cu",
34
+ "csrc/batch_mm.cu",
35
+ "csrc/moe.cpp"
36
+ ]
csrc/batch_mm.cu ADDED
@@ -0,0 +1,31 @@
1
+ // csrc/batch_mm.cu
2
+
3
+ #include <torch/torch.h>
4
+
5
+ // For now this simply uses a standard bmm, but it can be adapted for a
6
+ // faster batched expert matrix multiply if needed
7
+ torch::Tensor batch_mm(
8
+ torch::Tensor x,
9
+ torch::Tensor weights,
10
+ torch::Tensor batch_sizes,
11
+ torch::Tensor output,
12
+ bool trans_b) {
13
+ // Validate inputs
14
+ TORCH_CHECK(x.is_cuda(), "x must be on CUDA");
15
+ TORCH_CHECK(weights.is_cuda(), "weights must be on CUDA");
16
+ TORCH_CHECK(batch_sizes.is_cuda(), "batch_sizes must be on CUDA");
17
+
18
+ TORCH_CHECK(x.ndimension() == 3, "x must be 3D tensor"); // [E, C, H]
19
+ TORCH_CHECK(weights.ndimension() == 3,
20
+ "weights must be 3D tensor"); // [E, H, H_out]
21
+ TORCH_CHECK(batch_sizes.ndimension() == 1,
22
+ "batch_sizes must be 1D tensor"); // [E]
23
+
24
+ TORCH_CHECK(x.size(0) == weights.size(0) && x.size(0) == batch_sizes.size(0));
25
+ TORCH_CHECK(x.size(2) == weights.size(1)); // H dimension match
26
+
27
+ // For now, just fall back to a standard bmm; batch_sizes and trans_b are not yet used
28
+ // torch::bmm(x, weights, output);
29
+ torch::bmm_out(output, x, weights);
30
+ return output;
31
+ }
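Since the current implementation is a thin wrapper over `torch::bmm_out`, here is a minimal Python sketch of the semantics it provides today (ignoring `batch_sizes` and `trans_b`, as the fallback above does); the function name is illustrative only.

```python
import torch

def batch_mm_sketch(x, weights, batch_sizes, output, trans_b=False):
    # Current fallback semantics: a plain batched matmul written into the
    # pre-allocated output buffer. batch_sizes and trans_b are not yet used,
    # mirroring the C++ code above.
    torch.bmm(x, weights, out=output)  # [E, C, H] @ [E, H, H_out] -> [E, C, H_out]
    return output
```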
csrc/bincount_cumsum.cu ADDED
@@ -0,0 +1,98 @@
1
+ // csrc/bincount_cumsum.cu
2
+
3
+ #include <cuda.h>
4
+ #include <cuda_runtime.h>
5
+ #include <torch/torch.h>
6
+
7
+ template <typename scalar_t>
8
+ __global__ void bincount_cumsum_kernel(
9
+ const scalar_t *__restrict__ input,
10
+ int32_t *__restrict__ bins_out,
11
+ const int n_input,
12
+ const int n_bins) {
13
+ // Shared memory for local bincount
14
+ extern __shared__ int shared_counts[];
15
+
16
+ int tid = threadIdx.x;
17
+ int bid = blockIdx.x;
18
+ int threads_per_block = blockDim.x;
19
+
20
+ // Initialize shared memory
21
+ for (int i = tid; i < n_bins; i += threads_per_block) {
22
+ shared_counts[i] = 0;
23
+ }
24
+ __syncthreads();
25
+
26
+ // Each block processes a chunk of input
27
+ int start = bid * threads_per_block;
28
+ int end = min(start + threads_per_block, n_input);
29
+
30
+ // Bincount phase - each thread processes its elements
31
+ for (int i = start + tid; i < end; i += threads_per_block) {
32
+ if (i < n_input) {
33
+ int bin = static_cast<int>(input[i]);
34
+ if (bin >= 0 && bin < n_bins) {
35
+ atomicAdd(&shared_counts[bin], 1);
36
+ }
37
+ }
38
+ }
39
+ __syncthreads();
40
+
41
+ // Write block results to global memory
42
+ for (int i = tid; i < n_bins; i += threads_per_block) {
43
+ atomicAdd(&bins_out[i], shared_counts[i]);
44
+ }
45
+ __syncthreads();
46
+
47
+ // The cumulative sum over bins_out is finished by the host caller after
48
+ // this kernel completes; blocks cannot synchronize with each other here,
49
+ // so summing inside the kernel would race with other blocks' atomicAdds.
56
+ }
57
+
58
+ void bincount_cumsum_cuda(
59
+ torch::Tensor input,
60
+ torch::Tensor &bins_out,
61
+ int64_t minlength) {
62
+ TORCH_CHECK(input.is_cuda(), "Input must be CUDA tensor");
63
+ TORCH_CHECK(input.dtype() == torch::kInt32, "Input must be int32");
64
+ TORCH_CHECK(bins_out.is_cuda(), "Output must be CUDA tensor");
65
+
66
+ const auto n_input = input.numel();
67
+ const auto n_bins = static_cast<int>(minlength);
68
+
69
+ // Validate output tensor dimensions and clear it
70
+ TORCH_CHECK(bins_out.numel() >= n_bins,
71
+ "Output tensor must have at least minlength elements");
72
+ bins_out.zero_();
73
+
74
+ const int threads_per_block = 256;
75
+ const int n_blocks = (n_input + threads_per_block - 1) / threads_per_block;
76
+
77
+ // Launch kernel with shared memory for bincount
78
+ const size_t shared_mem_size = n_bins * sizeof(int);
79
+
80
+ AT_DISPATCH_INTEGRAL_TYPES(
81
+ input.scalar_type(),
82
+ "bincount_cumsum_cuda",
83
+ ([&] {
84
+ bincount_cumsum_kernel<scalar_t>
85
+ <<<n_blocks, threads_per_block, shared_mem_size>>>(
86
+ input.data_ptr<scalar_t>(),
87
+ bins_out.data_ptr<int32_t>(),
88
+ n_input,
89
+ n_bins);
90
+ }));
91
+
92
+ cudaError_t err = cudaGetLastError();
93
+ TORCH_CHECK(err == cudaSuccess,
94
+ "CUDA kernel failed: ",
95
+ cudaGetErrorString(err));
96
+
97
+ // Finish with an inclusive cumulative sum over the first n_bins entries.
98
+ // Doing it here on the tensor (ordered after the kernel on the stream)
99
+ // avoids the cross-block race of summing inside the kernel itself.
100
+ auto counts = bins_out.narrow(0, 0, n_bins);
101
+ counts.copy_(counts.cumsum(0));
102
+
103
+ // No return needed - output is modified in-place
104
+ }
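Functionally this op is a histogram of expert ids followed by an inclusive cumulative sum. A one-line PyTorch sketch of the same result (assuming non-negative int32 expert ids) is:

```python
import torch

def bincount_cumsum_sketch(expert_ids: torch.Tensor, num_experts: int) -> torch.Tensor:
    # bins[e] = number of assignments routed to experts 0..e (inclusive cumsum).
    return torch.bincount(expert_ids, minlength=num_experts).cumsum(0).to(torch.int32)
```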
csrc/gather.cu ADDED
@@ -0,0 +1,109 @@
1
+ // csrc/gather.cu
2
+
3
+ #include <cuda_runtime.h>
4
+ #include <torch/torch.h>
5
+
6
+ template <typename scalar_t>
7
+ __global__ void gather_kernel(
8
+ const scalar_t *__restrict__ x, // [T,H]
9
+ const int *__restrict__ idx, // [S]
10
+ const int *__restrict__ bins, // [E] cumulative
11
+ scalar_t *__restrict__ out, // [E,C,H]
12
+ int T,
13
+ int H,
14
+ int E,
15
+ int C,
16
+ int top_k) {
17
+ int e = blockIdx.x; // expert
18
+ int i = blockIdx.y; // row within capacity
19
+ if (e >= E || i >= C)
20
+ return;
21
+
22
+ const int end = bins[e];
23
+ const int start = (e == 0) ? 0 : bins[e - 1];
24
+ const int n = end - start;
25
+
26
+ bool valid = (i < n);
27
+ int tok = 0;
28
+ if (valid) {
29
+ int flat = idx[start + i];
30
+ tok = flat / top_k;
31
+ if (tok < 0 || tok >= T)
32
+ valid = false; // guard
33
+ }
34
+
35
+ const scalar_t *src = valid ? (x + (size_t)tok * H) : nullptr;
36
+ scalar_t *dst = out + ((size_t)e * C + i) * H;
37
+
38
+ int t = threadIdx.x;
39
+
40
+ // Try vectorized 16B moves if H is multiple of 4 and pointers are aligned
41
+ // (only for float type)
42
+ if constexpr (std::is_same<scalar_t, float>::value) {
43
+ if ((H % 4) == 0 && ((reinterpret_cast<uintptr_t>(dst) & 0xF) == 0) &&
44
+ (!valid || (reinterpret_cast<uintptr_t>(src) & 0xF) == 0)) {
45
+ const int HV = H / 4;
46
+ using F4 = float4;
47
+ const F4 *src4 = reinterpret_cast<const F4 *>(src);
48
+ F4 *dst4 = reinterpret_cast<F4 *>(dst);
49
+
50
+ for (int j = t; j < HV; j += blockDim.x) {
51
+ F4 v;
52
+ if (valid)
53
+ v = src4[j];
54
+ else
55
+ v = make_float4(0.f, 0.f, 0.f, 0.f);
56
+ dst4[j] = v;
57
+ }
58
+ return;
59
+ }
60
+ }
61
+
62
+ // Fallback to scalar copy
63
+ for (int j = t; j < H; j += blockDim.x) {
64
+ dst[j] = valid ? src[j] : scalar_t(0);
65
+ }
66
+ }
67
+
68
+ void gather_cuda(
69
+ torch::Tensor const &x, // [T, H]
70
+ torch::Tensor const &indices, // [S]
71
+ torch::Tensor const &bins, // [E] cumulative
72
+ torch::Tensor &output, // [E, C, H] pre-allocated output buffer
73
+ int64_t E, // number of experts
74
+ int64_t C, // expert capacity
75
+ int64_t top_k // top-k value
76
+ ) {
77
+ // Get dimensions
78
+ int64_t T = x.size(0);
79
+ int64_t H = x.size(1);
80
+
81
+ // Validate output tensor dimensions
82
+ TORCH_CHECK(output.size(0) == E && output.size(1) == C && output.size(2) == H,
83
+ "Output tensor must have shape [E, C, H]");
84
+
85
+ // Launch kernel with 2D grid (E, C)
86
+ dim3 grid(E, C);
87
+ int threads = 256;
88
+
89
+ AT_DISPATCH_FLOATING_TYPES_AND2(at::kHalf,
90
+ at::kBFloat16,
91
+ x.scalar_type(),
92
+ "gather_cuda",
93
+ ([&] {
94
+ using scalar_t_ =
95
+ scalar_t; // avoid shadowing surprises
96
+ gather_kernel<scalar_t_><<<grid, threads>>>(
97
+ x.data_ptr<scalar_t_>(),
98
+ indices.data_ptr<int>(),
99
+ bins.data_ptr<int>(),
100
+ output.data_ptr<scalar_t_>(),
101
+ (int)T,
102
+ (int)H,
103
+ (int)E,
104
+ (int)C,
105
+ (int)top_k);
106
+ }));
107
+
108
+ // No return needed - output is modified in-place
109
+ }
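For readability, a pure-PyTorch sketch of the gather semantics: the token id is recovered as `flat_index // top_k`, each expert receives at most `C` rows, and unused capacity rows stay zero. Names are illustrative, not part of the kernel API.

```python
import torch

def gather_sketch(x, sorted_indices, bins, E, C, top_k):
    T, H = x.shape
    out = torch.zeros(E, C, H, dtype=x.dtype, device=x.device)
    start = 0
    for e in range(E):
        end = int(bins[e])                                      # bins is an inclusive cumsum
        rows = sorted_indices[start:end][:C].long() // top_k    # token ids for expert e
        out[e, : rows.numel()] = x[rows]
        start = end
    return out
```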
csrc/index_select.cu ADDED
@@ -0,0 +1,51 @@
1
+ // csrc/index_select.cu
2
+
3
+ #include <c10/cuda/CUDAStream.h>
4
+ #include <cuda_runtime.h>
5
+ #include <torch/torch.h>
6
+
7
+ template <typename scalar_t>
8
+ __global__ void index_select_kernel(
9
+ const scalar_t *__restrict__ in,
10
+ const int32_t *__restrict__ idx,
11
+ scalar_t *__restrict__ out,
12
+ int64_t N) {
13
+ int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
14
+ if (i < N)
15
+ out[i] = in[(int64_t)idx[i]];
16
+ }
17
+
18
+ torch::Tensor index_select_out_cuda(
19
+ torch::Tensor out, // [N], same dtype/device as in
20
+ torch::Tensor in, // [M], contiguous
21
+ torch::Tensor idx_int32) // [N], int32, contiguous
22
+ {
23
+ TORCH_CHECK(in.is_cuda() && idx_int32.is_cuda() && out.is_cuda(),
24
+ "cuda only");
25
+ TORCH_CHECK(idx_int32.dtype() == torch::kInt32, "idx must be int32");
26
+ TORCH_CHECK(
27
+ in.is_contiguous() && idx_int32.is_contiguous() && out.is_contiguous(),
28
+ "contiguous required");
29
+
30
+ int64_t N = idx_int32.numel();
31
+ int threads = 256;
32
+ int blocks = (int)((N + threads - 1) / threads);
33
+
34
+ AT_DISPATCH_FLOATING_TYPES_AND2(
35
+ torch::kBFloat16,
36
+ torch::kHalf,
37
+ in.scalar_type(),
38
+ "index_select_int32",
39
+ [&] {
40
+ const scalar_t *pin = in.data_ptr<scalar_t>();
41
+ const int32_t *pidx = idx_int32.data_ptr<int32_t>();
42
+ scalar_t *pout = out.data_ptr<scalar_t>();
43
+ index_select_kernel<scalar_t>
44
+ <<<blocks, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(pin,
45
+ pidx,
46
+ pout,
47
+ N);
48
+ });
49
+ C10_CUDA_KERNEL_LAUNCH_CHECK();
50
+ return out;
51
+ }
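This is an `index_select` that accepts int32 indices directly (avoiding an int64 conversion); in plain PyTorch the equivalent is simply:

```python
import torch

def index_select_out_sketch(out, inp, idx_int32):
    # out[i] = inp[idx[i]]; stock indexing needs int64, so cast for the sketch.
    out.copy_(inp[idx_int32.long()])
    return out
```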
csrc/moe.cpp ADDED
@@ -0,0 +1,223 @@
1
+ // csrc/moe.cpp
2
+
3
+ #include <ATen/cuda/CUDAContext.h>
4
+ #include <c10/cuda/CUDAStream.h>
5
+ #include <torch/torch.h>
6
+
7
+ // Forward declarations for existing functions
8
+ void sort_cuda(torch::Tensor x,
9
+ int64_t end_bit,
10
+ torch::Tensor x_out,
11
+ torch::Tensor iota_out);
12
+
13
+ void bincount_cumsum_cuda(torch::Tensor input,
14
+ torch::Tensor &output,
15
+ int64_t minlength);
16
+
17
+ torch::Tensor index_select_out_cuda(torch::Tensor out,
18
+ torch::Tensor in,
19
+ torch::Tensor idx_int32);
20
+
21
+ void gather_cuda(torch::Tensor const &x,
22
+ torch::Tensor const &indices,
23
+ torch::Tensor const &bins,
24
+ torch::Tensor &output,
25
+ int64_t E,
26
+ int64_t C,
27
+ int64_t top_k);
28
+
29
+ void scatter_cuda(torch::Tensor const &src,
30
+ torch::Tensor const &indices,
31
+ torch::Tensor const &bins,
32
+ torch::Tensor const &weights,
33
+ torch::Tensor &y,
34
+ int64_t T,
35
+ int64_t E,
36
+ int64_t C,
37
+ int64_t top_k);
38
+
39
+ torch::Tensor batch_mm(torch::Tensor x,
40
+ torch::Tensor weights,
41
+ torch::Tensor batch_sizes,
42
+ torch::Tensor output,
43
+ bool trans_b = false);
44
+
45
+ torch::Tensor experts_cuda(
46
+ torch::Tensor hidden_states, // [B*S, H] - flattened hidden states
47
+ torch::Tensor router_indices, // [B*S, K] - expert indices per token
48
+ torch::Tensor routing_weights, // [B*S, E] or [B*S, K] - routing weights
49
+ torch::Tensor gate_up_proj, // [E, H, 2*H] - gate/up projection weights
50
+ torch::Tensor gate_up_proj_bias, // [E, 2*H] - gate/up projection bias
51
+ torch::Tensor down_proj, // [E, H, H] - down projection weights
52
+ torch::Tensor down_proj_bias, // [E, H] - down projection bias
53
+ int64_t expert_capacity, // C - capacity per expert
54
+ int64_t num_experts, // E - number of experts
55
+ int64_t top_k // K - top-k routing
56
+ ) {
57
+ // Input validation
58
+ TORCH_CHECK(hidden_states.is_cuda(), "hidden_states must be on CUDA");
59
+ TORCH_CHECK(router_indices.is_cuda(), "router_indices must be on CUDA");
60
+ TORCH_CHECK(routing_weights.is_cuda(), "routing_weights must be on CUDA");
61
+ TORCH_CHECK(gate_up_proj.is_cuda(), "gate_up_proj must be on CUDA");
62
+ TORCH_CHECK(gate_up_proj_bias.is_cuda(), "gate_up_proj_bias must be on CUDA");
63
+ TORCH_CHECK(down_proj.is_cuda(), "down_proj must be on CUDA");
64
+ TORCH_CHECK(down_proj_bias.is_cuda(), "down_proj_bias must be on CUDA");
65
+
66
+ TORCH_CHECK(hidden_states.ndimension() == 2,
67
+ "hidden_states must be 2D [T, H]");
68
+ TORCH_CHECK(router_indices.ndimension() == 2,
69
+ "router_indices must be 2D [T, K]");
70
+ TORCH_CHECK(routing_weights.ndimension() == 2,
71
+ "routing_weights must be 2D [T, K]");
72
+ TORCH_CHECK(gate_up_proj.ndimension() == 3,
73
+ "gate_up_proj must be 3D [E, H, 2*H]");
74
+ TORCH_CHECK(gate_up_proj_bias.ndimension() == 2,
75
+ "gate_up_proj_bias must be 2D [E, 2*H]");
76
+ TORCH_CHECK(down_proj.ndimension() == 3, "down_proj must be 3D [E, H, H]");
77
+ TORCH_CHECK(down_proj_bias.ndimension() == 2,
78
+ "down_proj_bias must be 2D [E, H]");
79
+
80
+ const int64_t T = hidden_states.size(0); // Total tokens
81
+ const int64_t H = hidden_states.size(1); // Hidden size
82
+ const int64_t E = num_experts;
83
+ const int64_t C = expert_capacity;
84
+ const int64_t K = top_k;
85
+
86
+ TORCH_CHECK(router_indices.size(0) == T && router_indices.size(1) == K);
87
+ TORCH_CHECK(routing_weights.size(0) == T && (routing_weights.size(1) == K ||
88
+ routing_weights.size(1) == E),
89
+ "routing_weights must be [T, K] or [T, E]");
90
+ TORCH_CHECK(gate_up_proj.size(0) == E && gate_up_proj.size(1) == H &&
91
+ gate_up_proj.size(2) == 2 * H);
92
+ TORCH_CHECK(gate_up_proj_bias.size(0) == E &&
93
+ gate_up_proj_bias.size(1) == 2 * H);
94
+ TORCH_CHECK(down_proj.size(0) == E && down_proj.size(1) == H &&
95
+ down_proj.size(2) == H);
96
+ TORCH_CHECK(down_proj_bias.size(0) == E && down_proj_bias.size(1) == H);
97
+
98
+ // Ensure simple contiguity where helpful
99
+ hidden_states = hidden_states.contiguous();
100
+ router_indices = router_indices.contiguous();
101
+ routing_weights = routing_weights.contiguous();
102
+
103
+ // ALLOCATE
104
+
105
+ auto device_opts = torch::TensorOptions()
106
+ .dtype(torch::kInt32)
107
+ .device(hidden_states.device());
108
+ auto int64_opts = torch::TensorOptions()
109
+ .dtype(torch::kInt64)
110
+ .device(hidden_states.device());
111
+ auto float_opts = torch::TensorOptions()
112
+ .dtype(hidden_states.dtype())
113
+ .device(hidden_states.device());
114
+
115
+ // Buffers for sorting
116
+ torch::Tensor flat_indices =
117
+ router_indices.flatten().to(torch::kInt32, /*non_blocking=*/true);
118
+ torch::Tensor sorted_values = torch::empty_like(flat_indices);
119
+ torch::Tensor sorted_indices = torch::empty_like(flat_indices);
120
+
121
+ // Buffer for bins - use int32 for smaller footprint
122
+ torch::Tensor bins =
123
+ torch::empty({E + 1},
124
+ device_opts); // Pre-allocate for bincount_cumsum result
125
+
126
+ // Buffer for gathered tokens
127
+ torch::Tensor x = torch::empty({E, C, H}, float_opts);
128
+
129
+ // Buffer for expert token counts
130
+ torch::Tensor expert_tokens = torch::empty({E}, device_opts);
131
+
132
+ // Buffers for intermediate results
133
+ torch::Tensor gate_up = torch::empty({E, C, 2 * H}, float_opts);
134
+
135
+ // Final output buffer
136
+ torch::Tensor output = torch::zeros_like(hidden_states);
137
+
138
+ // COMPUTE
139
+
140
+ // Sort tokens by expert
141
+ sort_cuda(flat_indices, 32, sorted_values, sorted_indices);
142
+
143
+ // Compute bins using bincount_cumsum
144
+ bincount_cumsum_cuda(sorted_values, bins, E);
145
+
146
+ // Gather tokens by expert
147
+ // [T, H] -> [E, C, H]
148
+ gather_cuda(hidden_states, sorted_indices, bins, x, E, C, K);
149
+
150
+ // bins holds an inclusive cumulative sum, so the per-expert token counts
151
+ // are the adjacent differences, with the first expert's count at bins[0].
152
+ expert_tokens[0] = bins[0];
153
+ if (E > 1) {
154
+ expert_tokens.slice(0, 1, E) =
155
+ bins.slice(0, 1, E) - bins.slice(0, 0, E - 1);
156
+ }
158
+ // Clamp to expert capacity
159
+ expert_tokens = torch::clamp(expert_tokens, 0, (int32_t)C);
160
+
161
+ batch_mm(x, gate_up_proj, expert_tokens, gate_up, true);
162
+
163
+ // add the gate bias to the output in-place
164
+ gate_up.add_(gate_up_proj_bias.unsqueeze(1));
165
+
166
+ // Compute the clamped GLU activation; the clamps below are out-of-place, so gate/up become standalone tensors rather than views into gate_up
167
+ auto gate = gate_up.index({torch::indexing::Ellipsis,
168
+ torch::indexing::Slice(torch::indexing::None,
169
+ torch::indexing::None,
170
+ 2)});
171
+ auto up =
172
+ gate_up.index({torch::indexing::Ellipsis,
173
+ torch::indexing::Slice(1, torch::indexing::None, 2)});
174
+
175
+ const float limit = 7.0f;
176
+ gate = gate.clamp(/*min=*/c10::nullopt, /*max=*/limit);
177
+ up = up.clamp(/*min=*/-limit, /*max=*/limit);
178
+
179
+ gate.mul_(torch::sigmoid(gate * 1.702f));
180
+ up.add_(1).mul_(gate);
181
+
182
+ // Down projection uses GLU result directly
183
+ gate_up.resize_(0);
184
+ batch_mm(up, down_proj, expert_tokens, gate_up, true);
185
+
186
+ // add the down_bias in-place
187
+ gate_up.add_(down_proj_bias.unsqueeze(1));
188
+
189
+ // Stage allocations right before use
190
+ torch::Tensor selected_weights = torch::empty({T * K}, float_opts);
191
+ torch::Tensor weights_sorted = torch::empty({T * K}, float_opts);
192
+
193
+ torch::Tensor selected_weights_2d =
194
+ selected_weights.view({T, K}); // named lvalue view
195
+ torch::Tensor flat_dense = routing_weights.view({T, E});
196
+ torch::Tensor flat_router = router_indices.view({T, K});
197
+
198
+ // gather_out(out&, self, dim, index, sparse_grad=false)
199
+ at::gather_out(selected_weights_2d,
200
+ flat_dense,
201
+ /*dim=*/1,
202
+ flat_router,
203
+ /*sparse_grad=*/false);
204
+
205
+ // Use int32 index select to avoid dtype conversion
206
+ index_select_out_cuda(weights_sorted, // [T*K], float_opts
207
+ selected_weights.view({T * K}), // const&, ok as rvalue
208
+ sorted_indices // int32 indices, no conversion needed
209
+ );
210
+
211
+ // Scatter back to original positions with weights applied
212
+ scatter_cuda(gate_up.view({E, C, H}),
213
+ sorted_indices,
214
+ bins,
215
+ weights_sorted,
216
+ output,
217
+ T,
218
+ E,
219
+ C,
220
+ K);
221
+
222
+ return output;
223
+ }
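To make the control flow above easier to follow, here is a hedged pure-PyTorch sketch of the pipeline `experts_cuda` orchestrates: sort assignments by expert, build the inclusive histogram, gather into `[E, C, H]`, apply the gate/up projection with the clamped GLU, run the down projection, and scatter the weighted rows back to token order. It mirrors the steps, not the performance, and is not a drop-in replacement.

```python
import torch

def experts_sketch(hidden_states, router_indices, routing_weights,
                   gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias,
                   expert_capacity, num_experts, top_k, limit=7.0, alpha=1.702):
    T, H = hidden_states.shape
    E, C, K = num_experts, expert_capacity, top_k

    # 1. Sort token-expert assignments by expert id (stable, like the radix sort).
    flat = router_indices.reshape(-1)
    sorted_vals, sorted_idx = torch.sort(flat, stable=True)

    # 2. Inclusive histogram over experts.
    bins = torch.bincount(sorted_vals, minlength=E).cumsum(0)

    # 3. Gather tokens into [E, C, H], zero-padded to capacity.
    x = torch.zeros(E, C, H, dtype=hidden_states.dtype, device=hidden_states.device)
    start = 0
    for e in range(E):
        end = int(bins[e])
        rows = (sorted_idx[start:end][:C] // K).long()
        x[e, : rows.numel()] = hidden_states[rows]
        start = end

    # 4. Gate/up projection followed by the clamped GLU activation.
    gate_up = torch.bmm(x, gate_up_proj) + gate_up_proj_bias[:, None, :]
    gate, up = gate_up[..., ::2], gate_up[..., 1::2]
    gate = gate.clamp(max=limit)
    up = up.clamp(min=-limit, max=limit)
    acts = (up + 1) * (gate * torch.sigmoid(gate * alpha))

    # 5. Down projection.
    down = torch.bmm(acts, down_proj) + down_proj_bias[:, None, :]

    # 6. Weighted scatter back to token order.
    out = torch.zeros_like(hidden_states)
    w = routing_weights.view(T, E).gather(1, router_indices.view(T, K)).reshape(-1)
    start = 0
    for e in range(E):
        end = int(bins[e])
        sel = sorted_idx[start:end][:C]
        tok = (sel // K).long()
        out.index_add_(0, tok, down[e, : sel.numel()] * w[sel][:, None])
        start = end
    return out
```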
csrc/scatter.cu ADDED
@@ -0,0 +1,147 @@
1
+ // csrc/scatter.cu
2
+
3
+ #include <cstdint>
4
+ #include <cuda_runtime.h>
5
+ #include <torch/torch.h>
6
+ #include <type_traits>
7
+
8
+ // Minimal atomic add shim:
9
+ // - native CUDA atomics for float/double
10
+ // - 16-bit CAS fallback for Half/BFloat16 (works on all SMs)
11
+
12
+ // CAS-based 16-bit atomic add (for c10::Half / c10::BFloat16)
13
+ template <typename T>
14
+ __device__ inline void atomicAdd16(
15
+ T *addr,
16
+ T val) {
17
+ // Find containing 32-bit word and whether we're the high or low 16 bits
18
+ std::uintptr_t uaddr = reinterpret_cast<std::uintptr_t>(addr);
19
+ unsigned int *base =
20
+ reinterpret_cast<unsigned int *>(uaddr & ~std::uintptr_t(0x3));
21
+ const bool hi_half = (uaddr & 0x2) != 0;
22
+
23
+ unsigned int old32 = *base, assumed;
24
+ do {
25
+ assumed = old32;
26
+
27
+ // Extract current 16-bit payload
28
+ unsigned short cur16 = hi_half ? (assumed >> 16) : (assumed & 0xFFFFu);
29
+
30
+ // Reinterpret those 16 bits as T, then promote to float
31
+ T cur;
32
+ *reinterpret_cast<unsigned short *>(&cur) = cur16;
33
+ float f = static_cast<float>(cur) + static_cast<float>(val);
34
+
35
+ // Convert back to T (rounds appropriately), grab its 16-bit payload
36
+ T res = static_cast<T>(f);
37
+ unsigned short res16 = *reinterpret_cast<unsigned short *>(&res);
38
+
39
+ // Merge back into the correct half and attempt CAS
40
+ unsigned int new32 =
41
+ hi_half ? ((assumed & 0x0000FFFFu) |
42
+ (static_cast<unsigned int>(res16) << 16))
43
+ : ((assumed & 0xFFFF0000u) | static_cast<unsigned int>(res16));
44
+
45
+ old32 = atomicCAS(base, assumed, new32);
46
+ } while (old32 != assumed);
47
+ }
48
+
49
+ // Unified atomicAdd for all scalar_t
50
+ template <typename T>
51
+ __device__ inline void atomicAddT(
52
+ T *addr,
53
+ T val) {
54
+ if constexpr (std::is_same<T, float>::value) {
55
+ atomicAdd(addr, val);
56
+ } else if constexpr (std::is_same<T, double>::value) {
57
+ atomicAdd(addr, val);
58
+ } else {
59
+ // c10::Half or c10::BFloat16
60
+ atomicAdd16(addr, val);
61
+ }
62
+ }
63
+
64
+ // Kernel: y[tok, :] += src[e, i, :] for valid (e,i)
65
+ // where tok = indices[bins[e-1] + i] / top_k
66
+ template <typename scalar_t>
67
+ __global__ void scatter_kernel(
68
+ const scalar_t *__restrict__ src, // [E, C, H]
69
+ const int *__restrict__ idx, // [S]
70
+ const int *__restrict__ bins, // [E] cumulative
71
+ const scalar_t *__restrict__ weights, // [S] routing weights (can be null)
72
+ scalar_t *__restrict__ y, // [T, H] (accumulated)
73
+ int T,
74
+ int H,
75
+ int E,
76
+ int C,
77
+ int top_k) {
78
+ int e = blockIdx.x;
79
+ int i = blockIdx.y;
80
+ if (e >= E || i >= C)
81
+ return;
82
+
83
+ const int end = bins[e];
84
+ const int start = (e == 0) ? 0 : bins[e - 1];
85
+ const int n = end - start;
86
+
87
+ bool valid = (i < n);
88
+ int tok = 0;
89
+ if (valid) {
90
+ int flat = idx[start + i];
91
+ tok = flat / top_k;
92
+ if (tok < 0 || tok >= T)
93
+ valid = false; // guard
94
+ }
95
+ if (!valid)
96
+ return;
97
+
98
+ const scalar_t *src_row = src + ((size_t)e * C + i) * H;
99
+ scalar_t *y_row = y + (size_t)tok * H;
100
+
101
+ // Get the weight/scale factor for this token if provided
102
+ scalar_t scale = (weights != nullptr) ? weights[start + i] : scalar_t(1.0);
103
+
104
+ int t = threadIdx.x;
105
+ for (int h = t; h < H; h += blockDim.x) {
106
+ atomicAddT(&y_row[h], src_row[h] * scale);
107
+ }
108
+ }
109
+
110
+ void scatter_cuda(
111
+ const torch::Tensor &src, // [E, C, H]
112
+ const torch::Tensor &indices, // [S] (int32)
113
+ const torch::Tensor &bins, // [E] cumulative (int32)
114
+ const torch::Tensor &weights, // [S] routing weights (optional)
115
+ torch::Tensor &y, // [T, H] (accumulate into)
116
+ int64_t T, // tokens
117
+ int64_t E, // experts
118
+ int64_t C, // capacity
119
+ int64_t top_k // router top-k
120
+ ) {
121
+ const int64_t H = src.size(2);
122
+
123
+ // Grid over experts x capacity; threads over H
124
+ dim3 grid(E, C);
125
+ int threads = 256;
126
+
127
+ // Include Half + BFloat16 in dispatch
128
+ AT_DISPATCH_FLOATING_TYPES_AND2(
129
+ at::kHalf,
130
+ at::kBFloat16,
131
+ src.scalar_type(),
132
+ "scatter_cuda",
133
+ ([&] {
134
+ using scalar_t_ = scalar_t;
135
+ scatter_kernel<scalar_t_><<<grid, threads>>>(
136
+ src.data_ptr<scalar_t_>(),
137
+ indices.data_ptr<int>(),
138
+ bins.data_ptr<int>(),
139
+ weights.defined() ? weights.data_ptr<scalar_t_>() : nullptr,
140
+ y.data_ptr<scalar_t_>(),
141
+ static_cast<int>(T),
142
+ static_cast<int>(H),
143
+ static_cast<int>(E),
144
+ static_cast<int>(C),
145
+ static_cast<int>(top_k));
146
+ }));
147
+ }
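The scatter step is the inverse of the gather: every expert row is scaled by its routing weight and accumulated back into the token it came from. A compact PyTorch sketch of the same accumulation (names illustrative):

```python
import torch

def scatter_sketch(src, sorted_indices, bins, weights, y, top_k):
    # src: [E, C, H], y: [T, H]; weights are ordered like sorted_indices.
    E, C, _ = src.shape
    start = 0
    for e in range(E):
        end = int(bins[e])
        n = min(end - start, C)
        pos = torch.arange(start, start + n, device=src.device)
        tok = sorted_indices[pos].long() // top_k
        y.index_add_(0, tok, src[e, :n] * weights[pos][:, None])
        start = end
    return y
```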
csrc/sort.cu ADDED
@@ -0,0 +1,93 @@
1
+ // csrc/sort.cu
2
+ // originally from
3
+ // https://github.com/databricks/megablocks/blob/main/csrc/sort.h
4
+
5
+ #include <c10/cuda/CUDAStream.h>
6
+ #include <cstdint>
7
+ #include <cub/cub.cuh>
8
+ #include <torch/torch.h>
9
+
10
+ #define CUDA_CALL(code) \
11
+ do { \
12
+ cudaError_t status = (code); \
13
+ std::string err = cudaGetErrorString(status); \
14
+ TORCH_CHECK(status == cudaSuccess, err); \
15
+ } while (0)
16
+
17
+ template <typename T>
18
+ void cub_radix_sort(
19
+ torch::Tensor x,
20
+ int64_t end_bit,
21
+ torch::Tensor x_out,
22
+ torch::Tensor iota_out) {
23
+ // Get iota for values in sort.
24
+ auto iota_options =
25
+ torch::TensorOptions().dtype(x.scalar_type()).device(x.device());
26
+ torch::Tensor iota = torch::arange(0, x.numel(), iota_options);
27
+
28
+ // Get temporary buffer size.
29
+ size_t scratchpad_bytes = 0;
30
+ CUDA_CALL(cub::DeviceRadixSort::SortPairs(
31
+ /*d_temp_storage*/ nullptr,
32
+ /*temp_storage_bytes*/ scratchpad_bytes,
33
+ /*d_keys_in*/ x.data_ptr<T>(),
34
+ /*d_keys_out*/ x_out.data_ptr<T>(),
35
+ /*d_values_in*/ iota.data_ptr<T>(),
36
+ /*d_values_out*/ iota_out.data_ptr<T>(),
37
+ /*num_items*/ x.numel(),
38
+ /*begin_bit*/ 0,
39
+ /*end_bit*/ end_bit,
40
+ /*stream*/ c10::cuda::getCurrentCUDAStream()));
41
+
42
+ // Allocate scratchpad.
43
+ auto options = torch::TensorOptions().dtype(torch::kInt8).device(x.device());
44
+ torch::Tensor scratchpad =
45
+ torch::empty(static_cast<long>(scratchpad_bytes), options);
46
+
47
+ // Run the kernel.
48
+ CUDA_CALL(cub::DeviceRadixSort::SortPairs(
49
+ /*d_temp_storage*/ scratchpad.data_ptr(),
50
+ /*temp_storage_bytes*/ scratchpad_bytes,
51
+ /*d_keys_in*/ x.data_ptr<T>(),
52
+ /*d_keys_out*/ x_out.data_ptr<T>(),
53
+ /*d_values_in*/ iota.data_ptr<T>(),
54
+ /*d_values_out*/ iota_out.data_ptr<T>(),
55
+ /*num_items*/ x.numel(),
56
+ /*begin_bit*/ 0,
57
+ /*end_bit*/ end_bit,
58
+ /*stream*/ c10::cuda::getCurrentCUDAStream()));
59
+ }
60
+
61
+ void sort_cuda(
62
+ torch::Tensor x,
63
+ int64_t end_bit,
64
+ torch::Tensor x_out,
65
+ torch::Tensor iota_out) {
66
+ TORCH_CHECK(x.is_cuda());
67
+ TORCH_CHECK(x.ndimension() == 1);
68
+ TORCH_CHECK(x.scalar_type() == torch::kInt16 ||
69
+ x.scalar_type() == torch::kInt32 ||
70
+ x.scalar_type() == torch::kInt64);
71
+ TORCH_CHECK(x_out.is_cuda());
72
+ TORCH_CHECK(x_out.ndimension() == 1);
73
+ TORCH_CHECK(x_out.scalar_type() == x.scalar_type());
74
+ TORCH_CHECK(iota_out.is_cuda());
75
+ TORCH_CHECK(iota_out.ndimension() == 1);
76
+ TORCH_CHECK(iota_out.scalar_type() == x.scalar_type());
77
+
78
+ // Exit early if there is no work to do.
79
+ if (x_out.numel() == 0)
80
+ return;
81
+
82
+ switch (x.scalar_type()) {
83
+ case torch::kInt16:
84
+ return cub_radix_sort<short>(x, end_bit, x_out, iota_out);
85
+ case torch::kInt32:
86
+ return cub_radix_sort<int>(x, end_bit, x_out, iota_out);
87
+ default:
88
+ TORCH_CHECK(x.scalar_type() == torch::kInt64);
89
+ return cub_radix_sort<long>(x, end_bit, x_out, iota_out);
90
+ }
91
+ }
92
+
93
+ #undef CUDA_CALL
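In PyTorch terms this pairs-sort is equivalent to a stable sort of the expert ids that also returns the permutation (the kernel feeds an `arange` in as the values):

```python
import torch

def sort_sketch(expert_ids: torch.Tensor):
    # Stable sort, matching cub::DeviceRadixSort::SortPairs with arange values.
    sorted_vals, perm = torch.sort(expert_ids, stable=True)
    return sorted_vals, perm.to(expert_ids.dtype)
```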
flake.lock ADDED
@@ -0,0 +1,168 @@
1
+ {
2
+ "nodes": {
3
+ "flake-compat": {
4
+ "locked": {
5
+ "lastModified": 1747046372,
6
+ "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
7
+ "owner": "edolstra",
8
+ "repo": "flake-compat",
9
+ "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
10
+ "type": "github"
11
+ },
12
+ "original": {
13
+ "owner": "edolstra",
14
+ "repo": "flake-compat",
15
+ "type": "github"
16
+ }
17
+ },
18
+ "flake-compat_2": {
19
+ "locked": {
20
+ "lastModified": 1733328505,
21
+ "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
22
+ "owner": "edolstra",
23
+ "repo": "flake-compat",
24
+ "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
25
+ "type": "github"
26
+ },
27
+ "original": {
28
+ "owner": "edolstra",
29
+ "repo": "flake-compat",
30
+ "type": "github"
31
+ }
32
+ },
33
+ "flake-utils": {
34
+ "inputs": {
35
+ "systems": "systems"
36
+ },
37
+ "locked": {
38
+ "lastModified": 1731533236,
39
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
40
+ "owner": "numtide",
41
+ "repo": "flake-utils",
42
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
43
+ "type": "github"
44
+ },
45
+ "original": {
46
+ "owner": "numtide",
47
+ "repo": "flake-utils",
48
+ "type": "github"
49
+ }
50
+ },
51
+ "flake-utils_2": {
52
+ "inputs": {
53
+ "systems": "systems_2"
54
+ },
55
+ "locked": {
56
+ "lastModified": 1731533236,
57
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
58
+ "owner": "numtide",
59
+ "repo": "flake-utils",
60
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
61
+ "type": "github"
62
+ },
63
+ "original": {
64
+ "owner": "numtide",
65
+ "repo": "flake-utils",
66
+ "type": "github"
67
+ }
68
+ },
69
+ "hf-nix": {
70
+ "inputs": {
71
+ "flake-compat": "flake-compat_2",
72
+ "flake-utils": "flake-utils_2",
73
+ "nixpkgs": "nixpkgs"
74
+ },
75
+ "locked": {
76
+ "lastModified": 1754038838,
77
+ "narHash": "sha256-oHigCT4z0ayyLyEuxdZooSXRAZP8lfOkZHzY1lx1U50=",
78
+ "owner": "huggingface",
79
+ "repo": "hf-nix",
80
+ "rev": "336f781fa284e193baa3d4c3ce3f95fb34e9ffad",
81
+ "type": "github"
82
+ },
83
+ "original": {
84
+ "owner": "huggingface",
85
+ "repo": "hf-nix",
86
+ "type": "github"
87
+ }
88
+ },
89
+ "kernel-builder": {
90
+ "inputs": {
91
+ "flake-compat": "flake-compat",
92
+ "flake-utils": "flake-utils",
93
+ "hf-nix": "hf-nix",
94
+ "nixpkgs": [
95
+ "kernel-builder",
96
+ "hf-nix",
97
+ "nixpkgs"
98
+ ]
99
+ },
100
+ "locked": {
101
+ "lastModified": 1756320464,
102
+ "narHash": "sha256-x9LI4h87/Z9UgTQjgeG0fRcdeXl91xIqBlTauGKZM70=",
103
+ "owner": "huggingface",
104
+ "repo": "kernel-builder",
105
+ "rev": "b4accba4496b28faef19a0487fbcf9686b14e2ef",
106
+ "type": "github"
107
+ },
108
+ "original": {
109
+ "owner": "huggingface",
110
+ "repo": "kernel-builder",
111
+ "type": "github"
112
+ }
113
+ },
114
+ "nixpkgs": {
115
+ "locked": {
116
+ "lastModified": 1752785354,
117
+ "narHash": "sha256-Y33ryUz7MPqKrZwlbQcsYCUz2jAJCacRf8jbs0tYUlA=",
118
+ "owner": "nixos",
119
+ "repo": "nixpkgs",
120
+ "rev": "d38025438a6ee456758dc03188ca6873a415463b",
121
+ "type": "github"
122
+ },
123
+ "original": {
124
+ "owner": "nixos",
125
+ "repo": "nixpkgs",
126
+ "rev": "d38025438a6ee456758dc03188ca6873a415463b",
127
+ "type": "github"
128
+ }
129
+ },
130
+ "root": {
131
+ "inputs": {
132
+ "kernel-builder": "kernel-builder"
133
+ }
134
+ },
135
+ "systems": {
136
+ "locked": {
137
+ "lastModified": 1681028828,
138
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
139
+ "owner": "nix-systems",
140
+ "repo": "default",
141
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
142
+ "type": "github"
143
+ },
144
+ "original": {
145
+ "owner": "nix-systems",
146
+ "repo": "default",
147
+ "type": "github"
148
+ }
149
+ },
150
+ "systems_2": {
151
+ "locked": {
152
+ "lastModified": 1681028828,
153
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
154
+ "owner": "nix-systems",
155
+ "repo": "default",
156
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
157
+ "type": "github"
158
+ },
159
+ "original": {
160
+ "owner": "nix-systems",
161
+ "repo": "default",
162
+ "type": "github"
163
+ }
164
+ }
165
+ },
166
+ "root": "root",
167
+ "version": 7
168
+ }
flake.nix ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ description = "Flake for yamoe kernels";
3
+
4
+ inputs = {
5
+ kernel-builder.url = "github:huggingface/kernel-builder";
6
+ };
7
+
8
+ outputs =
9
+ {
10
+ self,
11
+ kernel-builder,
12
+ }:
13
+ kernel-builder.lib.genFlakeOutputs {
14
+ path = ./.;
15
+ rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
16
+
17
+ pythonCheckInputs = pkgs: with pkgs; [
18
+ tqdm
19
+ py-cpuinfo
20
+ importlib-metadata
21
+ torchmetrics
22
+ ];
23
+ };
24
+ }
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,73 @@
1
+ #include <torch/library.h>
2
+
3
+ #include "registration.h"
4
+ #include "torch_binding.h"
5
+
6
+ TORCH_LIBRARY_EXPAND(
7
+ TORCH_EXTENSION_NAME,
8
+ ops) {
9
+ ops.def("gather("
10
+ "Tensor x, "
11
+ "Tensor indices, "
12
+ "Tensor bins, "
13
+ "Tensor! output, "
14
+ "int E, "
15
+ "int C, "
16
+ "int top_k) -> ()");
17
+ ops.impl("gather", torch::kCUDA, &gather_cuda);
18
+
19
+ ops.def("scatter("
20
+ "Tensor src, "
21
+ "Tensor indices, "
22
+ "Tensor bins, "
23
+ "Tensor weights, "
24
+ "Tensor! y, "
25
+ "int T, "
26
+ "int E, "
27
+ "int C, "
28
+ "int top_k) -> ()");
29
+ ops.impl("scatter", torch::kCUDA, &scatter_cuda);
30
+
31
+ ops.def("sort("
32
+ "Tensor x, "
33
+ "int end_bit, "
34
+ "Tensor! x_out, "
35
+ "Tensor! iota_out) -> ()");
36
+ ops.impl("sort", torch::kCUDA, &sort_cuda);
37
+
38
+ ops.def("bincount_cumsum("
39
+ "Tensor input, "
40
+ "Tensor! output, "
41
+ "int minlength) -> ()");
42
+ ops.impl("bincount_cumsum", torch::kCUDA, &bincount_cumsum_cuda);
43
+
44
+ ops.def("index_select_out("
45
+ "Tensor! out, "
46
+ "Tensor input, "
47
+ "Tensor idx_int32) -> Tensor");
48
+ ops.impl("index_select_out", torch::kCUDA, &index_select_out_cuda);
49
+
50
+ ops.def("batch_mm("
51
+ "Tensor x, "
52
+ "Tensor weights, "
53
+ "Tensor batch_sizes, "
54
+ "Tensor! output, "
55
+ "bool trans_b=False) -> Tensor");
56
+ ops.impl("batch_mm", torch::kCUDA, &batch_mm);
57
+
58
+ ops.def("experts("
59
+ "Tensor hidden_states, "
60
+ "Tensor router_indices, "
61
+ "Tensor routing_weights, "
62
+ "Tensor gate_up_proj, "
63
+ "Tensor gate_up_proj_bias, "
64
+ "Tensor down_proj, "
65
+ "Tensor down_proj_bias, "
66
+ "int expert_capacity, "
67
+ "int num_experts, "
68
+ "int top_k) -> Tensor");
69
+ ops.impl("experts", torch::kCUDA, &experts_cuda);
70
+ }
71
+
72
+ REGISTER_EXTENSION(
73
+ TORCH_EXTENSION_NAME)
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,55 @@
1
+ #pragma once
2
+
3
+ #include <torch/torch.h>
4
+
5
+ void gather_cuda(torch::Tensor const &x,
6
+ torch::Tensor const &indices,
7
+ torch::Tensor const &bins,
8
+ torch::Tensor &output,
9
+ int64_t E,
10
+ int64_t C,
11
+ int64_t top_k);
12
+
13
+ void scatter_cuda(torch::Tensor const &src,
14
+ torch::Tensor const &indices,
15
+ torch::Tensor const &bins,
16
+ torch::Tensor const &weights,
17
+ torch::Tensor &y,
18
+ int64_t T,
19
+ int64_t E,
20
+ int64_t C,
21
+ int64_t top_k);
22
+
23
+ void sort_cuda(torch::Tensor x,
24
+ int64_t end_bit,
25
+ torch::Tensor x_out,
26
+ torch::Tensor iota_out);
27
+
28
+ void bincount_cumsum_cuda(torch::Tensor input,
29
+ torch::Tensor &output,
30
+ int64_t minlength);
31
+
32
+ torch::Tensor index_select_out_cuda(torch::Tensor out,
33
+ torch::Tensor in,
34
+ torch::Tensor idx_int32);
35
+
36
+ torch::Tensor
37
+ batch_mm(torch::Tensor x, // [E, C, H] - expert tokens
38
+ torch::Tensor weights, // [E, H, H_out] - expert weight matrices
39
+ torch::Tensor batch_sizes, // [E] - actual tokens per expert (<=C)
40
+ torch::Tensor output, // [E, C, H_out] - output buffer
41
+ bool trans_b = false // transpose weights if needed
42
+ );
43
+
44
+ torch::Tensor experts_cuda(
45
+ torch::Tensor hidden_states, // [T, H] - flattened hidden states
46
+ torch::Tensor router_indices, // [T, K] - expert indices per token
47
+ torch::Tensor routing_weights, // [T, E] or [T, K] - routing weights
48
+ torch::Tensor gate_up_proj, // [E, H, 2*H] - gate/up projection weights
49
+ torch::Tensor gate_up_proj_bias, // [E, 2*H] - gate/up projection bias
50
+ torch::Tensor down_proj, // [E, H, H] - down projection weights
51
+ torch::Tensor down_proj_bias, // [E, H] - down projection bias
52
+ int64_t expert_capacity, // C - capacity per expert
53
+ int64_t num_experts, // E - number of experts
54
+ int64_t top_k // K - top-k routing
55
+ );
torch-ext/yamoe/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ from ._ops import ops
2
+ from . import reference
3
+
4
+ gather = ops.gather
5
+ scatter = ops.scatter
6
+ sort = ops.sort
7
+ bincount_cumsum = ops.bincount_cumsum
8
+ batch_mm = ops.batch_mm
9
+ experts = ops.experts
10
+
11
+ __all__ = [
13
+ "gather",
14
+ "scatter",
15
+ "sort",
16
+ "bincount_cumsum",
17
+ "batch_mm",
18
+ "experts",
19
+ # Export the reference implementation
20
+ "reference",
21
+ ]
torch-ext/yamoe/reference.py ADDED
@@ -0,0 +1,73 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ class GptOssExperts(nn.Module):
5
+ def __init__(self, config):
6
+ super().__init__()
7
+ self.intermediate_size = config.intermediate_size
8
+ self.num_experts = config.num_local_experts
9
+ self.hidden_size = config.hidden_size
10
+ self.expert_dim = self.intermediate_size
11
+ self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
12
+ self.gate_up_proj_bias = nn.Parameter(torch.empty(self.num_experts, 2 * self.expert_dim))
13
+ self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size)))
14
+ self.down_proj_bias = nn.Parameter(torch.empty(self.num_experts, self.hidden_size))
15
+ self.alpha = 1.702
16
+ self.limit = 7.0
17
+
18
+ def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
19
+ """
20
+ When training, it is more efficient to just loop over the experts and compute the output for each expert,
21
+ as otherwise the memory would explode.
22
+
23
+ For inference, we can sacrifice some memory and compute the output for all experts at once by repeating the inputs.
24
+
25
+ Args:
26
+ hidden_states (torch.Tensor): (batch_size, seq_len, hidden_size)
27
+ router_indices (torch.Tensor): (batch_size * token_num, top_k)
28
+ routing_weights (torch.Tensor): (batch_size * token_num, num_experts)
29
+ Returns:
30
+ torch.Tensor
31
+ """
32
+
35
+ batch_size = hidden_states.shape[0]
36
+ hidden_states = hidden_states.reshape(-1, self.hidden_size) # (num_tokens, hidden_size)
37
+ num_experts = routing_weights.shape[1]
38
+ if self.training:
39
+ next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
40
+ with torch.no_grad():
41
+ expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
42
+ expert_mask = expert_mask.permute(2, 1, 0)
43
+ # we sum over the top_k and the sequence length to get which experts
44
+ # are hit this time around
45
+ expert_hitted = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
46
+ for expert_idx in expert_hitted[:]:
47
+ with torch.no_grad():
48
+ _, token_idx = torch.where(expert_mask[expert_idx[0]])
49
+ current_state = hidden_states[token_idx]
50
+ gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
51
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
52
+ gate = gate.clamp(min=None, max=self.limit)
53
+ up = up.clamp(min=-self.limit, max=self.limit)
54
+ glu = gate * torch.sigmoid(gate * self.alpha)
55
+ gated_output = (up + 1) * glu
56
+ out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
57
+ weighted_output = out[0] * routing_weights[token_idx, expert_idx, None]
58
+ next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
59
+ next_states = next_states.view(batch_size, -1, self.hidden_size)
60
+ else:
61
+ hidden_states = hidden_states.repeat(num_experts, 1)
62
+ hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
63
+ gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
64
+ gate, up = gate_up[..., ::2], gate_up[..., 1::2]
65
+ gate = gate.clamp(min=None, max=self.limit)
66
+ up = up.clamp(min=-self.limit, max=self.limit)
67
+ glu = gate * torch.sigmoid(gate * self.alpha)
68
+ next_states = torch.bmm(((up + 1) * glu), self.down_proj)
69
+ next_states = next_states + self.down_proj_bias[..., None, :]
70
+ next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
71
+ next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
72
+ next_states = next_states.sum(dim=0)
73
+ return next_states
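A minimal usage sketch for this reference module; the `SimpleNamespace` config and the shapes below are illustrative assumptions (only the three attributes read by `__init__` are required), and the weights are randomly initialized since the class allocates empty parameters.

```python
import torch
from types import SimpleNamespace
from yamoe import reference  # assumes the built yamoe package is importable

# Hypothetical config; only these three attributes are read by __init__.
cfg = SimpleNamespace(hidden_size=64, intermediate_size=64, num_local_experts=4)

experts = reference.GptOssExperts(cfg).eval()
for p in experts.parameters():
    torch.nn.init.normal_(p, std=0.02)

tokens, top_k = 8, 2
hidden = torch.randn(1, tokens, cfg.hidden_size)
logits = torch.randn(tokens, cfg.num_local_experts)
weights, indices = torch.topk(torch.softmax(logits, dim=-1), top_k, dim=-1)
dense = torch.zeros(tokens, cfg.num_local_experts).scatter_(1, indices, weights)

with torch.no_grad():
    out = experts(hidden, router_indices=indices, routing_weights=dense)
print(out.shape)  # (1, tokens, hidden_size)
```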