added Adagrad (experimenting with it), added 'extended' model size (16 layers instead of 12, experimenting with it)
parent 4d75ee066c
commit f0c4baeb25
@@ -247,6 +247,8 @@ class Model:
 		if self.size == "double":
 			return 24
+		if self.size == "extended":
+			return 16
 		return 12
 
 	@property
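For orientation, the hunk above adds the new "extended" size to the layer-count property. A minimal sketch of the same size-to-depth mapping, with the class and attribute names here being illustrative stand-ins rather than the repo's actual definitions:

# Sketch only: map a config "size" string to transformer depth,
# mirroring the property changed above (names are assumptions).
class ModelConfig:
	def __init__(self, size="full"):
		self.size = size

	@property
	def layers(self):
		if self.size == "double":
			return 24
		if self.size == "extended":
			return 16
		return 12  # default size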
@@ -65,6 +65,8 @@ def load_engines(training=True):
 		params['d_coef'] = params['lr']
 		params['lr'] = 1.0
+	elif cfg.hyperparameters.optimizer.lower() == "adagrad":
+		optimizer_class = ml.Adagrad
 	else:
 		raise ValueError(f'Optimizer specified not implemented: {cfg.hyperparameters.optimizer}')
 
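The new branch follows the existing string-dispatch pattern in load_engines. A hedged sketch of that pattern is below; the prodigy branch and the final optimizer_class(**params) call are inferred from the surrounding diff context, not quoted from the file:

# Sketch, assuming `ml` aliases the optimizer classes and `params`
# already holds the model parameters and learning rate.
optimizer_name = cfg.hyperparameters.optimizer.lower()
if optimizer_name == "prodigy":
	optimizer_class = ml.Prodigy
	params['d_coef'] = params['lr']  # Prodigy derives its own step size
	params['lr'] = 1.0
elif optimizer_name == "adagrad":
	optimizer_class = ml.Adagrad
else:
	raise ValueError(f'Optimizer specified not implemented: {cfg.hyperparameters.optimizer}')
optimizer = optimizer_class(**params)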
@@ -342,10 +342,10 @@ def example_usage():
 		'n_tokens': 1024,
 		'd_model': 1024, # 256, # 1024, # 1536
 		'n_heads': 16, # 4, # 16, # 24
-		'n_layers': 12, # 32
+		'n_layers': 16, # 32
 		'n_experts': 1,
 
-		'l_padding': 8,
+		'l_padding': 8 if cfg.fp8.enabled else 0,
 	}
 	"""
 	kwargs = {
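The l_padding change makes the padding multiple conditional on fp8 being enabled; a plausible reading, not confirmed by this diff alone, is that fp8 GEMM kernels want sequence lengths padded to a multiple of 8. A minimal sketch of that kind of padding, with the helper name being an assumption:

import torch

def pad_sequence_length(x: torch.Tensor, multiple: int) -> torch.Tensor:
	"""Right-pad the last dimension of `x` to a multiple of `multiple` (no-op if multiple <= 1)."""
	if multiple <= 1:
		return x
	remainder = x.shape[-1] % multiple
	if remainder == 0:
		return x
	return torch.nn.functional.pad(x, (0, multiple - remainder))

# e.g. with l_padding = 8 if cfg.fp8.enabled else 0:
# tokens = pad_sequence_length(tokens, l_padding)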
@@ -367,6 +367,7 @@ def example_usage():
 	model = AR_NAR(**kwargs).to(device)
 	steps = 500
 	optimizer = ml.Prodigy(model.parameters(), lr=1.0)
+	#optimizer = ml.Adagrad(model.parameters(), lr=1.0e-2)
 	#optimizer = ml.AdamW(model.parameters(), lr=1.0e-4)
 
 	engine = Engine(model=model, optimizer=optimizer)
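The commented-out Adagrad line mirrors the existing AdamW alternative, with the larger base learning rate (1e-2 vs 1e-4) that Adagrad conventionally tolerates. A standalone sketch of the swap using plain torch.optim, with a toy model purely for illustration:

import torch

model = torch.nn.Linear(16, 16)

# Pick one optimizer; Adagrad accumulates per-parameter squared gradients,
# which shrinks its effective step over time, hence the larger base lr.
optimizer = torch.optim.Adagrad(model.parameters(), lr=1.0e-2)
# optimizer = torch.optim.AdamW(model.parameters(), lr=1.0e-4)

x = torch.randn(4, 16)
loss = model(x).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()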
@@ -42,10 +42,12 @@ if cfg.bitsandbytes.enabled:
 	Adam = bnb.optim.Adam8bit
 	AdamW = bnb.optim.AdamW8bit
 	SGD = bnb.optim.SGD8bit
+	Adagrad = bnb.optim.Adagrad8bit
 else:
 	Adam = torch.optim.Adam
 	AdamW = torch.optim.AdamW
 	SGD = torch.optim.SGD
+	Adagrad = torch.optim.Adagrad
 
 # handles generically converting to a specific tensor type and converting back (implemented solely for bfloat16)
 @contextmanager
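This last hunk extends the alias block so the rest of the code can refer to ml.Adagrad regardless of backend. A self-contained sketch of the same fallback pattern; the try/except is an assumption for portability, whereas the repo gates on cfg.bitsandbytes.enabled instead:

import torch

try:
	import bitsandbytes as bnb
	# 8-bit optimizer states reduce optimizer memory versus 32-bit states.
	Adam = bnb.optim.Adam8bit
	AdamW = bnb.optim.AdamW8bit
	SGD = bnb.optim.SGD8bit
	Adagrad = bnb.optim.Adagrad8bit
except ImportError:
	Adam = torch.optim.Adam
	AdamW = torch.optim.AdamW
	SGD = torch.optim.SGD
	Adagrad = torch.optim.Adagrad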