@@ -13,12 +13,11 @@ dataset:
   workers: 2
   cache: True
 
-  phones_range: [4, 256]
-  duration_range: [1.0, 16.0]
-  min_utterances: 32
+  phones_range: [4, 512]
+  duration_range: [1.0, 32.0]
 
   random_utterance: 1.0
-  max_prompts: 6
+  max_prompts: 3
   prompt_duration: 6.0
 
   sample_type: speaker
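
The widened ranges gate which utterances are eligible for sampling: after this change an utterance must fall within 4 to 512 phonemes and 1.0 to 32.0 seconds. A minimal sketch of the filter these two keys imply (the function name and signature are hypothetical, not from the repo):

    def utterance_in_range(n_phones: int, duration: float) -> bool:
        # Bounds taken from the new side of the hunk above.
        return 4 <= n_phones <= 512 and 1.0 <= duration <= 32.0
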
@@ -31,27 +30,22 @@ models:
 
   _models:
     - name: "ar+nar"
-      size: "double"
+      size: "full"
       resp_levels: 8
       prom_levels: 8
       tasks: 8
      arch_type: "retnet"
       training: True
-      version: 2
+      version: 3
 
 hyperparameters:
   batch_size: 8
-  gradient_accumulation_steps: 16
+  gradient_accumulation_steps: 32
   gradient_clipping: 100
 
-  # prodigyopt is nicer, but requires even more VRAM
-  #optimizer: Prodigy
-  #learning_rate: 1.0 # e-4
-
-  optimizer: AdamW
-  learning_rate: 1.0e-4
+  optimizer: Prodigy
+  torch_optimizer: True
+  learning_rate: 0.0625
 
   scheduler_type: ""
   #scheduler_type: OneCycle
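
For context on the optimizer swap: the hunk drops the commented-out Prodigy lines (whose VRAM caveat still applies, since Prodigy keeps extra per-parameter state) and makes Prodigy the active choice. A minimal sketch of what the two sides amount to at setup time, assuming the prodigyopt package and a stand-in module; this is illustrative, not the repo's actual wiring:

    import torch
    from prodigyopt import Prodigy  # pip install prodigyopt

    model = torch.nn.Linear(1024, 1024)  # stand-in for the real model

    # Old side of the hunk: plain AdamW at an absolute rate of 1.0e-4.
    # optimizer = torch.optim.AdamW(model.parameters(), lr=1.0e-4)

    # New side: Prodigy estimates the step size itself, so lr acts as a
    # multiplier on that estimate rather than an absolute learning rate.
    optimizer = Prodigy(model.parameters(), lr=0.0625)
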
@@ -118,8 +112,12 @@ inference:
   use_vocos: True
   normalize: False
 
+  weight_dtype: bfloat16
+  amp: False
+
 bitsandbytes:
   enabled: False
-  injects: False
-  linear: False
-  embedding: False
+  injects: True
+  linear: True
+  embedding: True
+
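
The bitsandbytes section stays disabled (enabled: False), but its sub-toggles now default to True, so the quantized replacements take effect as soon as it is switched on. A rough sketch of what linear and embedding typically gate, assuming the usual bitsandbytes module swaps (injects presumably patches the torch modules in globally instead of constructing them explicitly); the helpers below are illustrative, not from the repo:

    import torch.nn as nn
    import bitsandbytes as bnb  # pip install bitsandbytes

    def make_linear(in_f: int, out_f: int, use_bnb: bool) -> nn.Module:
        # linear: True would swap nn.Linear for an 8-bit variant.
        if use_bnb:
            return bnb.nn.Linear8bitLt(in_f, out_f, has_fp16_weights=False)
        return nn.Linear(in_f, out_f)

    def make_embedding(num: int, dim: int, use_bnb: bool) -> nn.Module:
        # embedding: True would swap nn.Embedding for StableEmbedding.
        if use_bnb:
            return bnb.nn.StableEmbedding(num, dim)
        return nn.Embedding(num, dim)
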