Added the Adagrad optimizer (experimental) and an 'extended' model size (16 layers instead of 12, also experimental)

This commit is contained in:
mrq 2024-04-09 22:04:01 -05:00
parent 4d75ee066c
commit f0c4baeb25
4 changed files with 9 additions and 2 deletions

View File

@@ -247,6 +247,8 @@ class Model:
if self.size == "double": if self.size == "double":
return 24 return 24
if self.size == "extended":
return 16
return 12 return 12
@property @property

View File

@@ -65,6 +65,8 @@ def load_engines(training=True):
params['d_coef'] = params['lr'] params['d_coef'] = params['lr']
params['lr'] = 1.0 params['lr'] = 1.0
elif cfg.hyperparameters.optimizer.lower() == "adagrad":
optimizer_class = ml.Adagrad
else: else:
raise ValueError(f'Optimizer specified not implemented: {cfg.hyperparameters.optimizer}') raise ValueError(f'Optimizer specified not implemented: {cfg.hyperparameters.optimizer}')

View File

@@ -342,10 +342,10 @@ def example_usage():
'n_tokens': 1024, 'n_tokens': 1024,
'd_model': 1024, # 256, # 1024, # 1536 'd_model': 1024, # 256, # 1024, # 1536
'n_heads': 16, # 4, # 16, # 24 'n_heads': 16, # 4, # 16, # 24
'n_layers': 12, # 32 'n_layers': 16, # 32
'n_experts': 1, 'n_experts': 1,
'l_padding': 8, 'l_padding': 8 if cfg.fp8.enabled else 0,
} }
""" """
kwargs = { kwargs = {
@@ -367,6 +367,7 @@ def example_usage():
model = AR_NAR(**kwargs).to(device) model = AR_NAR(**kwargs).to(device)
steps = 500 steps = 500
optimizer = ml.Prodigy(model.parameters(), lr=1.0) optimizer = ml.Prodigy(model.parameters(), lr=1.0)
#optimizer = ml.Adagrad(model.parameters(), lr=1.0e-2)
#optimizer = ml.AdamW(model.parameters(), lr=1.0e-4) #optimizer = ml.AdamW(model.parameters(), lr=1.0e-4)
engine = Engine(model=model, optimizer=optimizer) engine = Engine(model=model, optimizer=optimizer)

View File

@@ -42,10 +42,12 @@ if cfg.bitsandbytes.enabled:
Adam = bnb.optim.Adam8bit Adam = bnb.optim.Adam8bit
AdamW = bnb.optim.AdamW8bit AdamW = bnb.optim.AdamW8bit
SGD = bnb.optim.SGD8bit SGD = bnb.optim.SGD8bit
Adagrad = bnb.optim.Adagrad8bit
else: else:
Adam = torch.optim.Adam Adam = torch.optim.Adam
AdamW = torch.optim.AdamW AdamW = torch.optim.AdamW
SGD = torch.optim.SGD SGD = torch.optim.SGD
Adagrad = torch.optim.Adagrad
# handles generically converting to a specific tensor type and converting back (implemented solely for bfloat16) # handles generically converting to a specific tensor type and converting back (implemented solely for bfloat16)
@contextmanager @contextmanager