Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

.gitattributes +1 -0
README.md +8 -3
config.json +25 -0
model.safetensors +3 -0
modelling_trm.py +123 -0
special_tokens_map.json +7 -0
tokenizer_config.json +58 -0
vocab.txt +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,8 @@
----
-license: apache-2.0
----

+# Trained TRM Model
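+This repository contains a TRM model trained using the provided datasets, together with its configuration (`config.json`), weights (`model.safetensors`), model code (`modelling_trm.py`), and tokenizer files.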
+## How to use
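+Import `modelling_trm` first so that the `trm` model type is registered with `AutoConfig` and `AutoModel`, then load the checkpoint, e.g. with `AutoModel.from_pretrained`; the tokenizer is a `BertTokenizer`. [More detailed usage instructions can be added here]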

config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "H_cycles": 1,
+  "H_layers": 8,
+  "L_cycles": 1,
+  "L_layers": 2,
+  "act_epsilon": 0.01,
+  "act_threshold": 0.9,
+  "architectures": [
+    "TRM"
+  ],
+  "depth_H": 2,
+  "depth_L": 2,
+  "dropout": 0.1,
+  "dtype": "float32",
+  "expansion": 4,
+  "halt_epsilon": 0.01,
+  "halt_max_steps": 4,
+  "hidden_size": 32,
+  "model_type": "trm",
+  "num_heads": 4,
+  "pad_token_id": 0,
+  "seq_len": 4096,
+  "transformers_version": "4.57.0",
+  "vocab_size": 1183855
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89b827be0651807c55d7d4d3fcd1236efd8d9bcc1ff5ac64cd516718cede1383
+size 303611768

modelling_trm.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import math
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+from einops.layers.torch import EinMix
+from transformers import PreTrainedModel, PretrainedConfig
+# ---------------------------
+# Configuration Class
+# ---------------------------
+class TRMConfig(PretrainedConfig):
+    """Hyper-parameters for the TRM model.
+
+    Args:
+        vocab_size: size of the token embedding table and of the output head.
+        hidden_size: feature width used throughout the model.
+        seq_len: number of learned positional-embedding slots.
+        depth_L: how many ``TRMLayer`` modules the model stacks.
+        depth_H: how many ``HaltingBlock`` modules each layer contains.
+        act_threshold: cumulative halting probability at which a position
+            stops accumulating.
+        act_epsilon: the halting loop exits early once the mean remaining
+            probability drops below this value.
+        **kwargs: forwarded unchanged to ``PretrainedConfig``.
+    """
+    model_type = "trm"
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=256,
+        seq_len=128,
+        depth_L=2,
+        depth_H=2,
+        act_threshold=0.9,
+        act_epsilon=1e-2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Embedding and sequence geometry.
+        self.vocab_size, self.hidden_size, self.seq_len = vocab_size, hidden_size, seq_len
+        # Number of layers, and number of halting blocks inside each layer.
+        self.depth_L, self.depth_H = depth_L, depth_H
+        # Adaptive-computation halting parameters.
+        self.act_threshold, self.act_epsilon = act_threshold, act_epsilon
+# ---------------------------
+# Model Architecture
+# ---------------------------
+class HaltingBlock(nn.Module):
+    """Adaptive-computation block that accumulates a projected candidate.
+
+    Each position gets a halting probability ``sigmoid(act_proj(x))``. On every
+    step a still-running position adds that probability to its running total;
+    once the total would reach ``act_threshold`` the position halts and receives
+    the remaining mass ``1 - total`` instead. The candidate ``tanh(proj(x))`` is
+    accumulated with those per-step weights.
+
+    Args:
+        hidden_size: size of the last dimension of the input.
+        act_threshold: cumulative probability at which a position halts.
+        act_epsilon: the loop exits early once the mean remaining probability
+            over the whole tensor is below this value.
+    """
+    def __init__(self, hidden_size, act_threshold, act_epsilon):
+        super().__init__()
+        self.proj = nn.Linear(hidden_size, hidden_size)
+        self.act_proj = nn.Linear(hidden_size, 1)
+        self.act_threshold = act_threshold
+        self.act_epsilon = act_epsilon
+    def forward(self, x):
+        """Run the halting loop on ``x``.
+
+        Returns:
+            A tuple ``(accumulated_output, mean_accumulated_prob)`` where the
+            first item has the shape of ``x`` and the second is a scalar tensor.
+        """
+        halting_probs = torch.sigmoid(self.act_proj(x))
+        # x never changes inside the loop, so the candidate is computed once.
+        candidate = torch.tanh(self.proj(x))
+        zeros = torch.zeros_like(halting_probs)
+        still_running = torch.ones_like(halting_probs, dtype=torch.bool)
+        accumulated_output = torch.zeros_like(x)
+        accumulated_prob = torch.zeros_like(halting_probs)
+        while still_running.any():
+            was_running = still_running
+            p = torch.where(was_running, halting_probs, zeros)
+            still_running = (accumulated_prob + p) < self.act_threshold
+            # Positions that halt on this step take whatever mass is left.
+            remainder = torch.where(still_running, zeros, 1 - accumulated_prob)
+            update_weights = torch.where(still_running, p, remainder)
+            accumulated_output += update_weights * candidate
+            updated_prob = accumulated_prob + update_weights
+            # Fixed point: neither the totals nor the running mask changed, so
+            # every later iteration would be identical. This happens when a
+            # running position has a halting probability of 0 (sigmoid
+            # underflow) or one too small to change the float accumulator;
+            # without this guard the loop would never terminate.
+            stalled = (
+                torch.equal(updated_prob, accumulated_prob)
+                and torch.equal(still_running, was_running)
+            )
+            accumulated_prob = updated_prob
+            if (1 - accumulated_prob).mean() < self.act_epsilon:
+                break
+            if stalled:
+                break
+        return accumulated_output, accumulated_prob.mean()
+class TRMLayer(nn.Module):
+    """A stack of ``depth_H`` halting blocks followed by a LayerNorm.
+
+    Args:
+        hidden_size: feature width of the input and of every block.
+        depth_H: number of ``HaltingBlock`` modules applied in sequence.
+        act_threshold: passed through to every ``HaltingBlock``.
+        act_epsilon: passed through to every ``HaltingBlock``.
+    """
+    def __init__(self, hidden_size, depth_H, act_threshold, act_epsilon):
+        super().__init__()
+        halting_stack = []
+        for _ in range(depth_H):
+            halting_stack.append(HaltingBlock(hidden_size, act_threshold, act_epsilon))
+        self.blocks = nn.ModuleList(halting_stack)
+        self.norm = nn.LayerNorm(hidden_size)
+    def forward(self, x):
+        """Apply every halting block in order, then normalise the result."""
+        hidden = x
+        for halting_block in self.blocks:
+            # Each block also returns its mean accumulated probability; it is
+            # not used here.
+            hidden, _ = halting_block(hidden)
+        normalised = self.norm(hidden)
+        return normalised
+class TRM(PreTrainedModel):
+    """Language model built from stacked ``TRMLayer`` modules.
+
+    Token embeddings plus a learned positional table are passed through
+    ``config.depth_L`` layers, a final LayerNorm and a bias-free linear head
+    that produces vocabulary logits.
+    """
+    config_class = TRMConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.emb = nn.Embedding(config.vocab_size, config.hidden_size)
+        # One learned vector per position, broadcast over the batch dimension.
+        self.pos_emb = nn.Parameter(torch.zeros(1, config.seq_len, config.hidden_size))
+        self.layers = nn.ModuleList([
+            TRMLayer(config.hidden_size, config.depth_H, config.act_threshold, config.act_epsilon)
+            for _ in range(config.depth_L)
+        ])
+        self.norm = nn.LayerNorm(config.hidden_size)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.post_init()
+    def forward(self, input_ids, labels=None, attention_mask=None, token_type_ids=None):
+        """Compute logits and, when ``labels`` is given, the next-token loss.
+
+        Args:
+            input_ids: integer token ids; dimension 1 is used as the sequence
+                length when slicing the positional table.
+            labels: optional targets with the same layout as ``input_ids``.
+                They are shifted by one so that position ``t`` predicts the
+                token at ``t + 1``.
+            attention_mask: accepted so that tokenizer output can be unpacked
+                straight into the call; currently ignored.
+            token_type_ids: accepted for the same reason; currently ignored.
+
+        Returns:
+            A dict with ``"loss"`` (``None`` when no labels are given) and
+            ``"logits"``.
+        """
+        x = self.emb(input_ids) + self.pos_emb[:, :input_ids.size(1), :]
+        for layer in self.layers:
+            x = layer(x)
+        x = self.norm(x)
+        logits = self.lm_head(x)
+        loss = None
+        if labels is not None:
+            # Drop the last prediction and the first label so they line up.
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+        return {"loss": loss, "logits": logits}
+# ---------------------------
+# Utility: Register to AutoClasses
+# ---------------------------
+# These calls run at import time, so importing this module is what performs
+# the registration.
+from transformers import AutoConfig, AutoModel
+# Associate the "trm" model_type string with TRMConfig.
+AutoConfig.register("trm", TRMConfig)
+# Associate TRMConfig with the TRM model class for AutoModel.
+AutoModel.register(TRMConfig, TRM)

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff