Commit 2133880 · sharing groups
Parent(s): 52aac07

Files changed:
- app.py +5 -3
- mem_calc.py +5 -5
- models.py +13 -7
app.py
CHANGED

@@ -27,15 +27,17 @@ share_params = col2.checkbox("Share parameters", value=False)
 
 with st.expander("More options"):
     batch_size = int(st.number_input('Microbatch size (sequences)', min_value=1, step=1, value=1, format="%i"))
-    seq_len = int(st.number_input('Sequence length (max. tokens)', min_value=1, step=1, value=1024, format="%i"))
     precisions_names = ('Full', 'Mixed ("O1")', 'Pure 16-bit')
     precisions_values = ('O0', 'O1', 'O3')
+    sharing_groups = int(st.number_input('Shared parameter groups (used if Share parameters is checked)',
+                                         min_value=1, step=1, value=1, format="%i"))
     precision = st.selectbox('Precision', precisions_names, index=1)
 
 args = mem_calc.parse_args(f"""
 --model {model} --optimizer {optimizers_values[optimizers_names.index(optimizer)]}
-{'--checkpoint' if checkpoint else ''} {'--offload' if offload else ''}
---fp16-level {precisions_values[precisions_names.index(precision)]} --bsz {batch_size}
+{'--checkpoint' if checkpoint else ''} {'--offload' if offload else ''}
+--fp16-level {precisions_values[precisions_names.index(precision)]} --bsz {batch_size}
+{f'--shared_groups {sharing_groups}' if share_params else ''}
 """.split())
 
 
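The f-string above is split on whitespace into an argv-style list, so flags whose conditional expressions render as the empty string simply disappear after .split(). A minimal sketch of the pattern, with hypothetical stand-ins for the Streamlit widget state:

# Sketch only: the values below are made-up stand-ins for the
# Streamlit widget state in app.py.
model, optimizer_value, precision_value = 'gpt3-s', 'adam', 'O1'
checkpoint, offload, share_params = True, False, True
batch_size, sharing_groups = 1, 2

argv = f"""
--model {model} --optimizer {optimizer_value}
{'--checkpoint' if checkpoint else ''} {'--offload' if offload else ''}
--fp16-level {precision_value} --bsz {batch_size}
{f'--shared_groups {sharing_groups}' if share_params else ''}
""".split()

print(argv)
# ['--model', 'gpt3-s', '--optimizer', 'adam', '--checkpoint',
#  '--fp16-level', 'O1', '--bsz', '1', '--shared_groups', '2']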
mem_calc.py
CHANGED

@@ -24,7 +24,7 @@ def vocab(bsz, seqlen, dmodel, vocab_dim):
 
 
 def transformer(bsz, seqlen, dmodel, nlayers, vocab_type, dhid=None,
-                checkpoint=False,
+                checkpoint=False, shared_groups=None):
     if dhid is None: dhid = 4*dmodel
     model = 0
     grad = 0
@@ -33,8 +33,8 @@ def transformer(bsz, seqlen, dmodel, nlayers, vocab_type, dhid=None,
         model += m
         grad += g
 
-    if
-    model = model / nlayers
+    if shared_groups is not None:
+        model = model / nlayers * shared_groups
 
     m, g = vocab(bsz, seqlen, dmodel, vocab_type)
     model += m
@@ -128,7 +128,7 @@ def parse_args(args=None):
     parser.add_argument('--ngpus', type=int, default=1, help='The number of gpus. Default: 1')
     parser.add_argument('--zero', type=int, default=0,
                         help='The ZeRO level (1 optimizer, 2 optimizer+weights, 3 everything. Default: 1')
-    parser.add_argument('--
+    parser.add_argument('--shared_groups', type=int, default=None, help='Number of shared layer groups (as in ALBERT). Defaults to no sharing.')
     parser.add_argument('--checkpoint', action='store_true', help='Use gradient checkpointing.')
 
     return parser.parse_args(args)
@@ -143,7 +143,7 @@ def calculate_memory(args):
         if getattr(args, key, None) is None:
             setattr(args, key, value)
 
-    model, grad = transformer(args.bsz, args.seqlen, args.dmodel, args.nlayers, args.vocab_size, args.dhid, args.checkpoint, args.
+    model, grad = transformer(args.bsz, args.seqlen, args.dmodel, args.nlayers, args.vocab_size, args.dhid, args.checkpoint, args.shared_groups)
     parameters = model
 
     if args.optimizer == 'adam':
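The new shared_groups logic rescales only the weight memory: per-layer parameters are first summed over all nlayers, then shrunk so that just shared_groups distinct copies are counted, as in ALBERT-style layer sharing (every layer still executes, so activation memory is presumably unaffected). A standalone sketch of that arithmetic, using a hypothetical per-layer size:

# Sketch of the adjustment introduced above, outside the real mem_calc.py.
def shared_layer_params(per_layer, nlayers, shared_groups=None):
    model = per_layer * nlayers            # every layer counted once
    if shared_groups is not None:
        # nlayers layers reuse shared_groups distinct weight sets,
        # so stored parameters shrink by a factor of shared_groups/nlayers
        model = model / nlayers * shared_groups
    return model

# e.g. 12 layers of 7M parameters with 2 shared groups store
# 2 * 7M = 14M layer parameters instead of 84M
assert shared_layer_params(7e6, 12, shared_groups=2) == 14e6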
models.py
CHANGED

@@ -56,13 +56,6 @@ models['gpt2-xl']['dhid'] = 1600*4
 models['gpt2-xl']['nlayers'] = 48
 models['gpt2-xl']['vocab_size'] = 50257
 
-models['gpt-j-6b'] = {}
-models['gpt-j-6b']['seqlen'] = 2048
-models['gpt-j-6b']['dmodel'] = 4096
-models['gpt-j-6b']['dhid'] = 4096 * 4
-models['gpt-j-6b']['nlayers'] = 28
-models['gpt-j-6b']['vocab_size'] = 50400
-
 models['gpt3-s'] = {}
 models['gpt3-s']['seqlen'] = 2048
 models['gpt3-s']['dmodel'] = 768
@@ -118,3 +111,16 @@ models['gpt3-175b']['dmodel'] = 12288
 models['gpt3-175b']['dhid'] = 12288*4
 models['gpt3-175b']['nlayers'] = 96
 models['gpt3-175b']['vocab_size'] = 50257 # from public reimplementations
+
+models['gpt-j-6b'] = {}
+models['gpt-j-6b']['seqlen'] = 2048
+models['gpt-j-6b']['dmodel'] = 4096
+models['gpt-j-6b']['dhid'] = 4096 * 4
+models['gpt-j-6b']['nlayers'] = 28
+models['gpt-j-6b']['vocab_size'] = 50400
+
+models['dalle-12b'] = {}
+models['dalle-12b']['seqlen'] = 1024 + 256
+models['dalle-12b']['dmodel'] = 62 * 64
+models['dalle-12b']['nlayers'] = 64
+models['dalle-12b']['vocab_size'] = 8192 + 16384
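The gpt-j-6b preset is moved below the GPT-3 family rather than deleted, and a dalle-12b preset is added. Note that dalle-12b omits 'dhid', so transformer() falls back to its dhid = 4*dmodel default; its seqlen and vocab_size are written as image-plus-text sums. A hypothetical invocation exercising the new preset together with the new flag (assuming mem_calc is importable and calculate_memory returns the computed breakdown):

# Sketch only: exercises the dalle-12b preset and the --shared_groups flag.
import mem_calc

args = mem_calc.parse_args(
    '--model dalle-12b --optimizer adam --bsz 4 --shared_groups 4'.split())
memory = mem_calc.calculate_memory(args)  # assumed to return the breakdown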