Set activation_checkpoint_lvl to 100 by default
configuration_bert.py CHANGED (+4 -3)
@@ -55,9 +55,10 @@ class JinaBertConfig(PretrainedConfig):
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
         window_size (`tuple`, *optional*, defaults to `(-1, -1)`): If not the default, use local attention
-        activation_checkpoint_lvl (`int`, *optional*, defaults to `
+        activation_checkpoint_lvl (`int`, *optional*, defaults to `100`): How many layers to activation-checkpoint.
             If larger than 0, the MLP activation checkpointing level is expected to be 0 for the first
-            `activation_checkpoint_lvl` layers.
+            `activation_checkpoint_lvl` layers. The activation checkpointing will only come into effect
+            after `model.gradient_checkpointing_enable()` is called.
     """
 
     model_type = "bert"
@@ -89,7 +90,7 @@ class JinaBertConfig(PretrainedConfig):
         emb_pooler=None,
         classifier_dropout=None,
         num_loras=5,
-        activation_checkpoint_lvl=
+        activation_checkpoint_lvl=100,
         **kwargs,
     ):
         assert 'position_embedding_type' not in kwargs
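A minimal usage sketch of the changed parameter, assuming the config is loaded through transformers' AutoConfig/AutoModel with trust_remote_code=True; the repo id "jinaai/jina-embeddings-v2-base-en" and the override value 8 are illustrative assumptions, not part of this commit:

# Sketch: override the new default (100) and enable checkpointing.
from transformers import AutoConfig, AutoModel

# The repo id below is an assumption used only for illustration.
config = AutoConfig.from_pretrained(
    "jinaai/jina-embeddings-v2-base-en",
    trust_remote_code=True,
    activation_checkpoint_lvl=8,  # checkpoint only the first 8 MLP blocks
)
model = AutoModel.from_pretrained(
    "jinaai/jina-embeddings-v2-base-en",
    config=config,
    trust_remote_code=True,
)

# Per the docstring above, the setting only takes effect after this call:
model.gradient_checkpointing_enable()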