Uploading patch
modeling_gpt_bert.py (+3, -2)

@@ -310,6 +310,7 @@ class Embedding(nn.Module):
 class GPTBERTPreTrainedModel(PreTrainedModel):
     config_class = ModelConfig
     supports_gradient_checkpointing = False
+    base_model_prefix = "model"
 
     def _set_gradient_checkpointing(self, module, value=False):
         raise NotImplementedError("Gradient checkpointing is not supported by this model")
@@ -330,7 +331,7 @@ class GPTBERTPreTrainedModel(PreTrainedModel):
 
 class GPTBERT(GPTBERTPreTrainedModel):
 
-    def __init__(self, config: ModelConfig, is_causal: bool, **kwargs):
+    def __init__(self, config: ModelConfig, is_causal: bool = False, **kwargs):
         super().__init__(config, **kwargs)
         self.config = config
         self.hidden_size = config.hidden_size
@@ -380,7 +381,7 @@ class GPTBERT(GPTBERTPreTrainedModel):
             attention, layer_attention_probs = attention_layer(contextualized_embeddings[-1], attention_mask, relative_embeddings)
             layer_embeddings = contextualized_embeddings[-1] + attention
             layer_embeddings = self.dwa_modules(layer_embeddings, block_idx=i * 2)
-            layer_embeddings
+            layer_embeddings = layer_embeddings + mlp_layer(layer_embeddings)
            layer_embeddings = self.dwa_modules(layer_embeddings, block_idx=i * 2 + 1)
             contextualized_embeddings.append(layer_embeddings)
             attention_probs.append(layer_attention_probs)