diff --git a/codes/models/audio/tts/unet_diffusion_tts_flat.py b/codes/models/audio/tts/unet_diffusion_tts_flat.py index 15e7c590..23ea674a 100644 --- a/codes/models/audio/tts/unet_diffusion_tts_flat.py +++ b/codes/models/audio/tts/unet_diffusion_tts_flat.py @@ -91,6 +91,7 @@ class DiffusionTtsFlat(nn.Module): attn_dropout=dropout, use_rmsnorm=True, ff_glu=True, + ff_mult=2, rotary_pos_emb=True, ) )) @@ -104,12 +105,13 @@ class DiffusionTtsFlat(nn.Module): attn_layers=TimestepEmbeddingAttentionLayers( dim=model_channels, timestep_dim=time_embed_dim, - depth=3, + depth=2, heads=num_heads, ff_dropout=dropout, attn_dropout=dropout, use_rmsnorm=True, ff_glu=True, + ff_mult=2, rotary_pos_emb=True, layerdrop_percent=0, ) @@ -130,6 +132,7 @@ class DiffusionTtsFlat(nn.Module): attn_dropout=dropout, use_rmsnorm=True, ff_glu=True, + ff_mult=2, rotary_pos_emb=True, layerdrop_percent=layer_drop, zero_init_branch_output=True, diff --git a/codes/train.py b/codes/train.py index 6418df72..ee85b690 100644 --- a/codes/train.py +++ b/codes/train.py @@ -318,7 +318,7 @@ class Trainer: if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_diffusion_tts9_mel_flat.yml') + parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../experiments/train_gpt_tts_unified.yml') parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher') args = parser.parse_args() opt = option.parse(args.opt, is_train=True)