added Adagrad (experimenting with it), added 'extended' model size (16 layers instead of 12, experimenting with it)
This commit is contained in:
parent
4d75ee066c
commit
f0c4baeb25
@@ -247,6 +247,8 @@ class Model:
         if self.size == "double":
             return 24
+        if self.size == "extended":
+            return 16
         return 12
 
     @property
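For reference, a minimal sketch of the size-to-depth mapping this hunk extends. The field name, property name, and default value here are assumptions taken from the hunk context, not the repository's actual Model class:

from dataclasses import dataclass

@dataclass
class Model:
    size: str = "full"  # hypothetical default; the real config class defines many more fields

    @property
    def layers(self) -> int:
        # "double" keeps the existing 24-layer mapping, the new "extended" size
        # maps to 16 layers, and anything else falls back to the default 12
        if self.size == "double":
            return 24
        if self.size == "extended":
            return 16
        return 12

print(Model(size="extended").layers)  # 16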
@@ -65,6 +65,8 @@ def load_engines(training=True):
             params['d_coef'] = params['lr']
             params['lr'] = 1.0
+        elif cfg.hyperparameters.optimizer.lower() == "adagrad":
+            optimizer_class = ml.Adagrad
         else:
             raise ValueError(f'Optimizer specified not implemented: {cfg.hyperparameters.optimizer}')
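A minimal sketch of the optimizer dispatch pattern this hunk adds a branch to. It uses plain torch.optim classes and a string argument instead of the repository's cfg and ml wrapper module, so the names below are stand-ins only:

import torch

def pick_optimizer_class(name: str):
    # Stand-in for the cfg.hyperparameters.optimizer dispatch in load_engines();
    # the real code routes classes through its ml module and also handles Prodigy.
    name = name.lower()
    if name == "adamw":
        return torch.optim.AdamW
    elif name == "sgd":
        return torch.optim.SGD
    elif name == "adagrad":
        return torch.optim.Adagrad
    else:
        raise ValueError(f'Optimizer specified not implemented: {name}')

model = torch.nn.Linear(8, 8)
optimizer = pick_optimizer_class("adagrad")(model.parameters(), lr=1.0e-2)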
@@ -342,10 +342,10 @@ def example_usage():
         'n_tokens': 1024,
         'd_model': 1024, # 256, # 1024, # 1536
         'n_heads': 16, # 4, # 16, # 24
-        'n_layers': 12, # 32
+        'n_layers': 16, # 32
         'n_experts': 1,
 
-        'l_padding': 8,
+        'l_padding': 8 if cfg.fp8.enabled else 0,
     }
     """
     kwargs = {
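An illustrative stand-in for the kwargs above, showing the two changes in this hunk: the 16-layer "extended" depth and padding that is applied only when FP8 is enabled. The fp8_enabled flag replaces cfg.fp8.enabled, and the alignment rationale in the comment is an assumption, not something the diff states:

fp8_enabled = False

kwargs = {
    'n_tokens': 1024,
    'd_model': 1024,
    'n_heads': 16,
    'n_layers': 16,  # the new "extended" depth
    'n_experts': 1,
    # pad lengths to a multiple of 8 only when FP8 is in use, since FP8 kernels
    # generally want aligned shapes; otherwise apply no padding
    'l_padding': 8 if fp8_enabled else 0,
}

assert kwargs['l_padding'] == 0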
@@ -367,6 +367,7 @@ def example_usage():
     model = AR_NAR(**kwargs).to(device)
     steps = 500
     optimizer = ml.Prodigy(model.parameters(), lr=1.0)
+    #optimizer = ml.Adagrad(model.parameters(), lr=1.0e-2)
     #optimizer = ml.AdamW(model.parameters(), lr=1.0e-4)
 
     engine = Engine(model=model, optimizer=optimizer)
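A small sketch of trying the newly added Adagrad line on a toy module. The diff itself keeps Prodigy(lr=1.0) as the active optimizer and only adds Adagrad as a commented-out alternative; AR_NAR and Engine are replaced here by a plain torch layer:

import torch

model = torch.nn.Linear(16, 16)

# Mirrors the commented-out Adagrad line above; lr=1.0e-2 comes from the diff.
optimizer = torch.optim.Adagrad(model.parameters(), lr=1.0e-2)
# optimizer = torch.optim.AdamW(model.parameters(), lr=1.0e-4)

x = torch.randn(4, 16)
loss = model(x).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()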
@@ -42,10 +42,12 @@ if cfg.bitsandbytes.enabled:
     Adam = bnb.optim.Adam8bit
     AdamW = bnb.optim.AdamW8bit
     SGD = bnb.optim.SGD8bit
+    Adagrad = bnb.optim.Adagrad8bit
 else:
     Adam = torch.optim.Adam
     AdamW = torch.optim.AdamW
     SGD = torch.optim.SGD
+    Adagrad = torch.optim.Adagrad
 
 # handles generically converting to a specific tensor type and converting back (implemented solely for bfloat16)
 @contextmanager
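A minimal sketch of the alias-with-fallback pattern this hunk extends to Adagrad. The bitsandbytes_enabled flag stands in for cfg.bitsandbytes.enabled, and the import sits inside the branch so the sketch still runs when bitsandbytes is not installed:

import torch

bitsandbytes_enabled = False

if bitsandbytes_enabled:
    import bitsandbytes as bnb
    Adagrad = bnb.optim.Adagrad8bit  # 8-bit optimizer state, lower memory use
else:
    Adagrad = torch.optim.Adagrad    # plain PyTorch fallback

optimizer = Adagrad(torch.nn.Linear(8, 8).parameters(), lr=1.0e-2)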