Upload 10 files

- LMConfig.py (+4, -5)
- config.json (+3, -3)
- model.py (+21, -8)
LMConfig.py

@@ -7,8 +7,8 @@ class LMConfig(PretrainedConfig):
 
     def __init__(
             self,
-            dim: int =
-            n_layers: int =
+            dim: int = 896,
+            n_layers: int = 24,
             tie_word_embeddings: bool = True,
             ###########################################
             attention:str='GQA',
@@ -27,7 +27,7 @@ class LMConfig(PretrainedConfig):
             hidden_dim: int = None,
             multiple_of: int = 64,
             norm_eps: float = 1e-5,
-            max_seq_len: int =
+            max_seq_len: int = 512,
             rope_theta: int = 1e6,
             dropout: float = 0.0,
             flash_attn: bool = True,
@@ -46,10 +46,9 @@ class LMConfig(PretrainedConfig):
             norm_topk_prob: bool = True,
             **kwargs,
     ):
-        super().__init__(
+        super().__init__(tie_word_embeddings=tie_word_embeddings,**kwargs)
         self.dim = dim
         self.n_layers = n_layers
-        self.tie_word_embeddings = tie_word_embeddings
         self.vocab_size = vocab_size
         self.hidden_dim = hidden_dim
         self.multiple_of = multiple_of
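The key change in LMConfig.py is that tie_word_embeddings is no longer stored by hand on the config; it is forwarded to PretrainedConfig.__init__, so the base class records it, serializes it into config.json, and the model's weight-tying hooks can see it. A minimal usage sketch, assuming the new defaults above; the save directory name is a placeholder:

    from LMConfig import LMConfig

    # Values mirror the new defaults; the output directory is illustrative only.
    config = LMConfig(dim=896, n_layers=24, max_seq_len=512, tie_word_embeddings=True)

    # tie_word_embeddings is now handled by PretrainedConfig, so it is part of
    # the serialized config rather than a manually assigned attribute.
    print(config.tie_word_embeddings)               # True
    config.save_pretrained("./NanoChat-0.3B-base")  # writes config.json including the flag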
config.json

@@ -1,13 +1,13 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "NanoChat-0.3B-base/",
   "architectures": [
     "NanoChatLM"
   ],
   "attention": "GQA",
   "auto_map": {
     "AutoConfig": "LMConfig.LMConfig",
-    "
-    "
+    "AutoModel": "model.NanoChatLM",
+    "AutoModelForCausalLM": "model.NanoChatLM"
   },
   "aux_loss_alpha": 0.1,
   "dim": 896,
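With AutoModel and AutoModelForCausalLM added to auto_map, the Auto classes can resolve the custom NanoChatLM class shipped in model.py, so the checkpoint loads without importing the repo code by hand. A loading sketch, assuming the files sit in a local directory (the path is a placeholder); trust_remote_code=True is required because the architecture lives in repo code rather than in transformers itself:

    from transformers import AutoConfig, AutoModelForCausalLM

    path = "./NanoChat-0.3B-base"  # placeholder: directory holding config.json, LMConfig.py, model.py

    config = AutoConfig.from_pretrained(path, trust_remote_code=True)           # resolves LMConfig.LMConfig
    model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True)  # resolves model.NanoChatLM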
model.py

@@ -397,11 +397,18 @@ class NanoChatLM(PreTrainedModel):
         self.layers = nn.ModuleList([NanoChatBlock(l, params) for l in range(self.n_layers)])
         self.norm = RMSNorm(params.dim, eps=params.norm_eps)
         self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
-        if params.tie_word_embeddings:
-
+        # if params.tie_word_embeddings:
+        #     self.output.weight = self.tok_embeddings.weight
         self.register_buffer("pos_cis", precompute_pos_cis(params.dim // params.n_heads, params.max_seq_len,
                                                            theta=params.rope_theta), persistent=False)
         self.OUT = CausalLMOutputWithPast()
+
+        self.post_init()
+
+    def tie_weights(self):
+        super().tie_weights()
+        if self.params.tie_word_embeddings:
+            self.output.weight = self.tok_embeddings.weight
 
     def forward(self,
                 input_ids: Optional[torch.Tensor] = None,
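Rather than assigning the shared weight inline in __init__, the model now overrides tie_weights() and calls self.post_init(). post_init() runs the standard PreTrainedModel initialization, which invokes tie_weights() once the submodules exist, and from_pretrained re-ties after loading, so the embedding and output projections stay shared. A stripped-down sketch of the same pattern with toy module names, not the repo's actual code:

    import torch.nn as nn
    from transformers import PretrainedConfig, PreTrainedModel

    class ToyConfig(PretrainedConfig):
        def __init__(self, vocab_size=100, dim=32, tie_word_embeddings=True, **kwargs):
            super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
            self.vocab_size, self.dim = vocab_size, dim

    class ToyLM(PreTrainedModel):
        config_class = ToyConfig

        def __init__(self, config):
            super().__init__(config)
            self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim)
            self.output = nn.Linear(config.dim, config.vocab_size, bias=False)
            self.post_init()  # runs weight init and calls tie_weights()

        def tie_weights(self):
            if self.config.tie_word_embeddings:
                self.output.weight = self.tok_embeddings.weight  # share one matrix

    model = ToyLM(ToyConfig())
    assert model.output.weight is model.tok_embeddings.weight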
@@ -429,16 +436,16 @@ class NanoChatLM(PreTrainedModel):
 
     @torch.inference_mode()
     def generate(self, input_ids, eos_token_id=151643, max_new_tokens=1024, temperature=0.75, top_p=0.90,
-
+                 stream=False, rp=1., use_cache=True, pad_token_id=0, do_sample=True, **args):
         # 流式生成 (streaming generation)
         if stream:
-            return self._generate_stream(input_ids, eos_token_id, max_new_tokens, temperature, top_p, rp, use_cache)
+            return self._generate_stream(input_ids, eos_token_id, max_new_tokens, temperature, top_p, rp, use_cache, do_sample)
 
         # 直接生成 (direct generation)
         generated = []
         for i in range(input_ids.size(0)):
             non_pad = input_ids[i][input_ids[i] != pad_token_id].unsqueeze(0)
-            out = self._generate_stream(non_pad, eos_token_id, max_new_tokens, temperature, top_p, rp, use_cache)
+            out = self._generate_stream(non_pad, eos_token_id, max_new_tokens, temperature, top_p, rp, use_cache, do_sample)
             tokens_list = [tokens[:, -1:] for tokens in out]
             gen = torch.cat(tokens_list, dim=-1) if tokens_list else non_pad
             full_sequence = torch.cat([non_pad, gen], dim=-1)
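Because do_sample is threaded through both paths, a caller can flip between sampling and greedy decoding, and between batched and streaming output, from the public generate() call alone. A usage sketch, assuming a model and a matching tokenizer are already loaded (for example via the Auto classes shown earlier); the prompt text and token budget are placeholders:

    prompt_ids = tokenizer("Hello", return_tensors="pt").input_ids  # tokenizer is assumed to exist

    # Batched path: returns a tensor of completed sequences.
    out = model.generate(prompt_ids, max_new_tokens=64, temperature=0.75, top_p=0.90,
                         do_sample=True)  # do_sample=False switches to greedy decoding
    print(tokenizer.decode(out[0]))

    # Streaming path: returns a generator yielding the tokens produced so far.
    for partial in model.generate(prompt_ids, max_new_tokens=64, stream=True, do_sample=False):
        print(tokenizer.decode(partial[0]), end="\r")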
@@ -452,14 +459,14 @@ class NanoChatLM(PreTrainedModel):
         ]
         return torch.cat(generated, dim=0)
 
-    def _generate_stream(self, input_ids, eos_token_id, max_new_tokens, temperature, top_p, rp, use_cache, **args):
+    def _generate_stream(self, input_ids, eos_token_id, max_new_tokens, temperature, top_p, rp, use_cache, do_sample, **args):
         start, first_seq, past_kvs = input_ids.shape[1], True, None
         while input_ids.shape[1] < max_new_tokens - 1:
             if first_seq or not use_cache:
                 out, first_seq = self(input_ids, past_key_values=past_kvs, use_cache=use_cache), False
             else:
                 out = self(input_ids[:, -1:], past_key_values=past_kvs, use_cache=use_cache,
-
+                           start_pos=input_ids.shape[1] - 1)
             logits, past_kvs = out.logits[:, -1, :], out.past_key_values
             logits[:, list(set(input_ids.tolist()[0]))] /= rp
             logits /= (temperature + 1e-9)
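The restored start_pos argument matters only on the cached path: once past_key_values holds the keys and values for the prefix, each step feeds just the last token, and start_pos tells the model which absolute position that token occupies so the rotary embeddings stay aligned. A condensed sketch of the same decode step (standalone, not the repo's forward code):

    def decode_step(model, ids, past_kvs, use_cache=True):
        # First step (or no cache): run the whole prefix through the model.
        if past_kvs is None or not use_cache:
            out = model(ids, past_key_values=None, use_cache=use_cache)
        else:
            # Later steps: feed only the newest token, at its absolute offset.
            out = model(ids[:, -1:], past_key_values=past_kvs, use_cache=True,
                        start_pos=ids.shape[1] - 1)
        return out.logits[:, -1, :], out.past_key_values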
@@ -472,7 +479,13 @@ class NanoChatLM(PreTrainedModel):
             sorted_indices_to_remove[:, 0] = False
             indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
             logits[indices_to_remove] = -float('Inf')
-
+
+            if do_sample:
+                input_ids_next = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
+            else:
+                # Greedy decoding: choose the token with the highest probability
+                input_ids_next = torch.argmax(F.softmax(logits, dim=-1), dim=-1).unsqueeze(-1)
+
             input_ids = torch.cat((input_ids, input_ids_next), dim=1)
             yield input_ids[:, start:]
             if input_ids_next.item() == eos_token_id:
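After the top-p filter has masked out the low-probability tail, the new branch either samples from what remains (do_sample=True) or takes the argmax (greedy). A self-contained sketch of that selection step on a dummy logits tensor; the top-p filtering shown is the usual sorted-cumulative-probability recipe, written independently of the repo's exact code:

    import torch
    import torch.nn.functional as F

    def pick_next_token(logits, top_p=0.90, do_sample=True):
        # logits: (batch, vocab) -> returns (batch, 1) token ids.
        # Nucleus (top-p) filtering: drop tokens outside the smallest set whose
        # cumulative probability exceeds top_p.
        sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_remove = cum_probs > top_p
        sorted_remove[:, 1:] = sorted_remove[:, :-1].clone()  # always keep the top token
        sorted_remove[:, 0] = False
        remove = sorted_remove.scatter(1, sorted_indices, sorted_remove)
        logits = logits.masked_fill(remove, -float('Inf'))

        if do_sample:
            return torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)  # sample the nucleus
        return torch.argmax(logits, dim=-1, keepdim=True)                       # greedy pick

    next_id = pick_next_token(torch.randn(1, 32), top_p=0.90, do_sample=False)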