sample_rate : 24_000 # 44_100 for dac
audio_backend : "vocos" # or dac
models :
- name : "ar+nar" # vanity name
size : "full" # model dimensionality
resp_levels : 8 # RVQ levels this model targets
  prom_levels : 8 # should always match resp_levels above
tasks : 8 # tasks this model can attend to, only tts is supported at the moment
langs : 2 # languages this model supports, semi-unused at the moment
tones : 1 # tones this model supports, currently unused
arch_type : llama # underlying LLM arch to use, currently focusing on llama
training : True # signals this model is to be trained
version : 5 # helps keep backwards compatibility for when I add new things to the model
attention : auto # attention mechanism to use, "auto" for safety
  dropout : 0.1 # dropout probability applied during training
# factors for split loss values, remove to have a unified loss calculation
loss_factors :
text : 0.1 # text phoneme portion of the sequence
prom : 0.0 # input prompt portion of the sequence
    resp : 1.0 # output audio portion of the sequence
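  # a minimal sketch of how these factors are assumed to combine: each portion's
  # loss is weighted by its factor and summed into the total, i.e.
  #   loss = 0.1 * text_loss + 0.0 * prom_loss + 1.0 * resp_loss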
# experimental settings
experimental :
    hf : False # uses vall_e.models.experimental, a wrapper around an HF model that could technically be used for non-pytorch backends later
    interleave : False # interleaves RVQ levels, only works with the above for now
audio_embedding_mode : "" # "" | "inclusive" | "exclusive", whether to utilize the audio backend's embeddings with the input embeddings
audio_embedding_sums : False # whether the input embeddings include all prior RVQ levels (sums) or only the current one, further experimentation is needed to see if this matters
hyperparameters :
autotune : False
autotune_params :
start_profile_step : 1
end_profile_step : 50
num_tuning_micro_batch_sizes : 8
batch_size : 16
gradient_accumulation_steps : 4
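  # note: assuming the usual semantics for gradient accumulation, the effective
  # batch size per device works out to batch_size * gradient_accumulation_steps = 16 * 4 = 64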
gradient_clipping : 1.0
warmup_steps : 100
optimizer : Prodigy
learning_rate : 1.0
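  # note: Prodigy adaptively scales its own step size, so learning_rate is
  # conventionally left at 1.0 when using it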
torch_optimizer : True
scheduler : "" # ScheduleFree
torch_scheduler : True
evaluation :
batch_size : 8
frequency : 5000
size : 8
steps : 500
ar_temperature : 0.95
nar_temperature : 0.25
load_disabled_engines : True
trainer :
#no_logger: True
ddp : False
#check_for_oom: False
iterations : 1_000_000
save_tag : step
save_on_oom : True
save_on_quit : True
save_frequency : 250
export_on_save : True
keep_last_checkpoints : 4
gradient_checkpointing : True
strict_loading : False
#load_state_dict: True
#load_tag: "9500"
#load_states: False
#restart_step_count: True
gc_mode : None # "global_step"
weight_dtype : float32 # float16 or bfloat16
amp : False
backend : deepspeed
deepspeed :
inferencing : True
zero_optimization_level : 0
use_compression_training : False
amp : False
load_webui : False
inference :
backend : deepspeed
normalize : False
weight_dtype : float32 # float16 or bfloat16
amp : False
optimizations :
injects : False
replace : True
linear : False
embedding : False
optimizers : True
bitsandbytes : False
dadaptation : False
bitnet : False
fp8 : False
dataset :
speaker_name_getter : "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
speaker_group_getter : "lambda p: f'{p.parts[-3]}'"
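  # example of what the getters above produce for a hypothetical dataset layout,
  # e.g. a path like ".../LibriTTS/1034/121119/utt.wav":
  #   speaker name -> "1034_121119" (parts[-3] and parts[-2] joined)
  #   speaker group -> "1034"       (parts[-3] alone)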
speaker_languages :
ja : [ ]
use_hdf5 : True
use_metadata : True
hdf5_flag : r
validate : True
workers : 2
cache : True
duration_range : [ 3.0 , 5.0 ]
random_utterance : 1.0
max_prompts : 1
prompt_duration : 3.0
max_resps : 1
p_resp_append : 0.25
sample_type : path # path | speaker | group
sample_order : duration # shuffle | duration
  sample_max_duration_batch : 0 # used when sample_order is duration; 120 seconds per batch works at 12GiB of VRAM
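  # presumably, when sample_order is duration and the value above is non-zero,
  # batches are packed by total audio length (e.g. 120 -> up to 120 seconds of
  # audio per batch) instead of by a fixed batch_size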
  tasks_list : [ "tts" ] # full set: [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse" ]
training : [ ]
validation : [ ]
noise : [ ]