diff --git a/vall_e/config.py b/vall_e/config.py
index 5c47122..cdf3ac6 100755
--- a/vall_e/config.py
+++ b/vall_e/config.py
@@ -247,6 +247,8 @@ class Model:
 		if self.size == "double":
 			return 24
+		if self.size == "extended":
+			return 16
 		return 12
 
 	@property
diff --git a/vall_e/engines/__init__.py b/vall_e/engines/__init__.py
index 35e163d..2ae2fef 100755
--- a/vall_e/engines/__init__.py
+++ b/vall_e/engines/__init__.py
@@ -65,6 +65,8 @@ def load_engines(training=True):
 			params['d_coef'] = params['lr']
 			params['lr'] = 1.0
+		elif cfg.hyperparameters.optimizer.lower() == "adagrad":
+			optimizer_class = ml.Adagrad
 		else:
 			raise ValueError(f'Optimizer specified not implemented: {cfg.hyperparameters.optimizer}')
diff --git a/vall_e/models/ar_nar.py b/vall_e/models/ar_nar.py
index 3e4568b..85dda43 100644
--- a/vall_e/models/ar_nar.py
+++ b/vall_e/models/ar_nar.py
@@ -342,10 +342,10 @@ def example_usage():
 		'n_tokens': 1024,
 		'd_model': 1024, # 256, # 1024, # 1536
 		'n_heads': 16, # 4, # 16, # 24
-		'n_layers': 12, # 32
+		'n_layers': 16, # 32
 		'n_experts': 1,
 
-		'l_padding': 8,
+		'l_padding': 8 if cfg.fp8.enabled else 0,
 	}
 	"""
 	kwargs = {
@@ -367,6 +367,7 @@ def example_usage():
 	model = AR_NAR(**kwargs).to(device)
 	steps = 500
 	optimizer = ml.Prodigy(model.parameters(), lr=1.0)
+	#optimizer = ml.Adagrad(model.parameters(), lr=1.0e-2)
 	#optimizer = ml.AdamW(model.parameters(), lr=1.0e-4)
 	engine = Engine(model=model, optimizer=optimizer)
diff --git a/vall_e/utils/wrapper.py b/vall_e/utils/wrapper.py
index e22c037..1a8e122 100755
--- a/vall_e/utils/wrapper.py
+++ b/vall_e/utils/wrapper.py
@@ -42,10 +42,12 @@ if cfg.bitsandbytes.enabled:
 	Adam = bnb.optim.Adam8bit
 	AdamW = bnb.optim.AdamW8bit
 	SGD = bnb.optim.SGD8bit
+	Adagrad = bnb.optim.Adagrad8bit
 else:
 	Adam = torch.optim.Adam
 	AdamW = torch.optim.AdamW
 	SGD = torch.optim.SGD
+	Adagrad = torch.optim.Adagrad
 
 # handles generically converting to a specific tensor type and converting back (implemented solely for bfloat16)
 @contextmanager
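
A minimal sketch (not part of the patch) of how the new "adagrad" option would resolve at runtime, assuming the cfg layout shown in the hunks above; the try/except import guard, the local bitsandbytes_enabled flag, and the torch.nn.Linear stand-in model are illustrative assumptions, not code from the repository:

import torch

try:
	import bitsandbytes as bnb
	bitsandbytes_enabled = True   # stands in for cfg.bitsandbytes.enabled
except ImportError:
	bitsandbytes_enabled = False

# mirrors vall_e/utils/wrapper.py after this patch: 8-bit Adagrad when
# bitsandbytes is active, stock torch.optim.Adagrad otherwise
Adagrad = bnb.optim.Adagrad8bit if bitsandbytes_enabled else torch.optim.Adagrad

# mirrors the new branch in load_engines(); the string would normally come
# from cfg.hyperparameters.optimizer in the training config
optimizer_name = "adagrad"
if optimizer_name.lower() == "adagrad":
	optimizer_class = Adagrad

model = torch.nn.Linear(16, 16)   # hypothetical stand-in for AR_NAR(**kwargs)
optimizer = optimizer_class(model.parameters(), lr=1.0e-2)

With this wiring, setting hyperparameters.optimizer to "adagrad" in the training config selects the 8-bit bitsandbytes variant whenever cfg.bitsandbytes.enabled is set, and falls back to the stock torch.optim.Adagrad otherwise.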