Small fix
Browse files
- config.json +0 -3
- configuration_deepseek.py +0 -11
- modeling_deepseek.py +1 -2
    	
        config.json
    CHANGED
    
    | @@ -9,7 +9,6 @@ | |
| 9 | 
             
                "AutoModel": "modeling_deepseek.DeepseekV3Model",
         | 
| 10 | 
             
                "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
         | 
| 11 | 
             
              },
         | 
| 12 | 
            -
              "aux_loss_alpha": 0.001,
         | 
| 13 | 
             
              "bos_token_id": 0,
         | 
| 14 | 
             
              "eos_token_id": 1,
         | 
| 15 | 
             
              "ep_size": 1,
         | 
| @@ -32,7 +31,6 @@ | |
| 32 | 
             
              "num_hidden_layers": 61,
         | 
| 33 | 
             
              "num_key_value_heads": 128,
         | 
| 34 | 
             
              "num_nextn_predict_layers": 1,
         | 
| 35 | 
            -
              "pretraining_tp": 1,
         | 
| 36 | 
             
              "q_lora_rank": 1536,
         | 
| 37 | 
             
              "qk_nope_head_dim": 128,
         | 
| 38 | 
             
              "qk_rope_head_dim": 64,
         | 
| @@ -58,7 +56,6 @@ | |
| 58 | 
             
              "rope_theta": 10000,
         | 
| 59 | 
             
              "routed_scaling_factor": 2.5,
         | 
| 60 | 
             
              "scoring_func": "sigmoid",
         | 
| 61 | 
            -
              "seq_aux": true,
         | 
| 62 | 
             
              "tie_word_embeddings": false,
         | 
| 63 | 
             
              "topk_group": 4,
         | 
| 64 | 
             
              "topk_method": "noaux_tc",
         | 
|  | |
| 9 | 
             
                "AutoModel": "modeling_deepseek.DeepseekV3Model",
         | 
| 10 | 
             
                "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
         | 
| 11 | 
             
              },
         | 
|  | |
| 12 | 
             
              "bos_token_id": 0,
         | 
| 13 | 
             
              "eos_token_id": 1,
         | 
| 14 | 
             
              "ep_size": 1,
         | 
|  | |
| 31 | 
             
              "num_hidden_layers": 61,
         | 
| 32 | 
             
              "num_key_value_heads": 128,
         | 
| 33 | 
             
              "num_nextn_predict_layers": 1,
         | 
|  | |
| 34 | 
             
              "q_lora_rank": 1536,
         | 
| 35 | 
             
              "qk_nope_head_dim": 128,
         | 
| 36 | 
             
              "qk_rope_head_dim": 64,
         | 
|  | |
| 56 | 
             
              "rope_theta": 10000,
         | 
| 57 | 
             
              "routed_scaling_factor": 2.5,
         | 
| 58 | 
             
              "scoring_func": "sigmoid",
         | 
|  | |
| 59 | 
             
              "tie_word_embeddings": false,
         | 
| 60 | 
             
              "topk_group": 4,
         | 
| 61 | 
             
              "topk_method": "noaux_tc",
         | 
    	
        configuration_deepseek.py
    CHANGED
    
    | @@ -82,11 +82,6 @@ class DeepseekV3Config(PretrainedConfig): | |
| 82 | 
             
                        Beginning of stream token id.
         | 
| 83 | 
             
                    eos_token_id (`int`, *optional*, defaults to 2):
         | 
| 84 | 
             
                        End of stream token id.
         | 
| 85 | 
            -
                    pretraining_tp (`int`, *optional*, defaults to 1):
         | 
| 86 | 
            -
                        Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
         | 
| 87 | 
            -
                        document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
         | 
| 88 | 
            -
                        necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
         | 
| 89 | 
            -
                        issue](https://github.com/pytorch/pytorch/issues/76232).
         | 
| 90 | 
             
                    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
         | 
| 91 | 
             
                        Whether to tie weight embeddings
         | 
| 92 | 
             
                    rope_theta (`float`, *optional*, defaults to 10000.0):
         | 
| @@ -141,8 +136,6 @@ class DeepseekV3Config(PretrainedConfig): | |
| 141 | 
             
                    first_k_dense_replace = 3,
         | 
| 142 | 
             
                    norm_topk_prob = True,
         | 
| 143 | 
             
                    scoring_func = 'sigmoid',
         | 
| 144 | 
            -
                    aux_loss_alpha = 0.001,
         | 
| 145 | 
            -
                    seq_aux = True,
         | 
| 146 | 
             
                    hidden_act="silu",
         | 
| 147 | 
             
                    max_position_embeddings=4096,
         | 
| 148 | 
             
                    initializer_range=0.02,
         | 
| @@ -151,7 +144,6 @@ class DeepseekV3Config(PretrainedConfig): | |
| 151 | 
             
                    pad_token_id=None,
         | 
| 152 | 
             
                    bos_token_id=0,
         | 
| 153 | 
             
                    eos_token_id=1,
         | 
| 154 | 
            -
                    pretraining_tp=1,
         | 
| 155 | 
             
                    tie_word_embeddings=False,
         | 
| 156 | 
             
                    rope_theta=10000.0,
         | 
| 157 | 
             
                    rope_scaling=None,
         | 
| @@ -184,8 +176,6 @@ class DeepseekV3Config(PretrainedConfig): | |
| 184 | 
             
                    self.first_k_dense_replace = first_k_dense_replace
         | 
| 185 | 
             
                    self.norm_topk_prob = norm_topk_prob
         | 
| 186 | 
             
                    self.scoring_func = scoring_func
         | 
| 187 | 
            -
                    self.aux_loss_alpha = aux_loss_alpha
         | 
| 188 | 
            -
                    self.seq_aux = seq_aux
         | 
| 189 | 
             
                    # for backward compatibility
         | 
| 190 | 
             
                    if num_key_value_heads is None:
         | 
| 191 | 
             
                        num_key_value_heads = num_attention_heads
         | 
| @@ -194,7 +184,6 @@ class DeepseekV3Config(PretrainedConfig): | |
| 194 | 
             
                    self.hidden_act = hidden_act
         | 
| 195 | 
             
                    self.initializer_range = initializer_range
         | 
| 196 | 
             
                    self.rms_norm_eps = rms_norm_eps
         | 
| 197 | 
            -
                    self.pretraining_tp = pretraining_tp
         | 
| 198 | 
             
                    self.use_cache = use_cache
         | 
| 199 | 
             
                    self.rope_theta = rope_theta
         | 
| 200 | 
             
                    self.rope_scaling = rope_scaling
         | 
|  | |
| 82 | 
             
                        Beginning of stream token id.
         | 
| 83 | 
             
                    eos_token_id (`int`, *optional*, defaults to 2):
         | 
| 84 | 
             
                        End of stream token id.
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 85 | 
             
                    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
         | 
| 86 | 
             
                        Whether to tie weight embeddings
         | 
| 87 | 
             
                    rope_theta (`float`, *optional*, defaults to 10000.0):
         | 
|  | |
| 136 | 
             
                    first_k_dense_replace = 3,
         | 
| 137 | 
             
                    norm_topk_prob = True,
         | 
| 138 | 
             
                    scoring_func = 'sigmoid',
         | 
|  | |
|  | |
| 139 | 
             
                    hidden_act="silu",
         | 
| 140 | 
             
                    max_position_embeddings=4096,
         | 
| 141 | 
             
                    initializer_range=0.02,
         | 
|  | |
| 144 | 
             
                    pad_token_id=None,
         | 
| 145 | 
             
                    bos_token_id=0,
         | 
| 146 | 
             
                    eos_token_id=1,
         | 
|  | |
| 147 | 
             
                    tie_word_embeddings=False,
         | 
| 148 | 
             
                    rope_theta=10000.0,
         | 
| 149 | 
             
                    rope_scaling=None,
         | 
|  | |
| 176 | 
             
                    self.first_k_dense_replace = first_k_dense_replace
         | 
| 177 | 
             
                    self.norm_topk_prob = norm_topk_prob
         | 
| 178 | 
             
                    self.scoring_func = scoring_func
         | 
|  | |
|  | |
| 179 | 
             
                    # for backward compatibility
         | 
| 180 | 
             
                    if num_key_value_heads is None:
         | 
| 181 | 
             
                        num_key_value_heads = num_attention_heads
         | 
|  | |
| 184 | 
             
                    self.hidden_act = hidden_act
         | 
| 185 | 
             
                    self.initializer_range = initializer_range
         | 
| 186 | 
             
                    self.rms_norm_eps = rms_norm_eps
         | 
|  | |
| 187 | 
             
                    self.use_cache = use_cache
         | 
| 188 | 
             
                    self.rope_theta = rope_theta
         | 
| 189 | 
             
                    self.rope_scaling = rope_scaling
         | 
    	
        modeling_deepseek.py
    CHANGED
    
    | @@ -398,7 +398,6 @@ class MoEGate(nn.Module): | |
| 398 | 
             
                    self.n_routed_experts = config.n_routed_experts
         | 
| 399 | 
             
                    self.routed_scaling_factor = config.routed_scaling_factor
         | 
| 400 | 
             
                    self.scoring_func = config.scoring_func
         | 
| 401 | 
            -
                    self.seq_aux = config.seq_aux
         | 
| 402 | 
             
                    self.topk_method = config.topk_method
         | 
| 403 | 
             
                    self.n_group = config.n_group
         | 
| 404 | 
             
                    self.topk_group = config.topk_group
         | 
| @@ -455,7 +454,7 @@ class MoEGate(nn.Module): | |
| 455 | 
             
                            )
         | 
| 456 | 
             
                            .reshape(bsz * seq_len, -1)
         | 
| 457 | 
             
                        )  # [n, e]
         | 
| 458 | 
            -
            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
         | 
| 459 | 
             
                        _, topk_idx = torch.topk(
         | 
| 460 | 
             
                            tmp_scores, k=self.top_k, dim=-1, sorted=False
         | 
| 461 | 
             
                        )
         | 
|  | |
| 398 | 
             
                    self.n_routed_experts = config.n_routed_experts
         | 
| 399 | 
             
                    self.routed_scaling_factor = config.routed_scaling_factor
         | 
| 400 | 
             
                    self.scoring_func = config.scoring_func
         | 
|  | |
| 401 | 
             
                    self.topk_method = config.topk_method
         | 
| 402 | 
             
                    self.n_group = config.n_group
         | 
| 403 | 
             
                    self.topk_group = config.topk_group
         | 
|  | |
| 454 | 
             
                            )
         | 
| 455 | 
             
                            .reshape(bsz * seq_len, -1)
         | 
| 456 | 
             
                        )  # [n, e]
         | 
| 457 | 
            +
                        tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]
         | 
| 458 | 
             
                        _, topk_idx = torch.topk(
         | 
| 459 | 
             
                            tmp_scores, k=self.top_k, dim=-1, sorted=False
         | 
| 460 | 
             
                        )
         | 
