# vall-e/data/config.yaml

sample_rate: 24_000 # 44_100 for dac
audio_backend: "vocos" # or dac
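
# a (hypothetical, untested here) dac setup changes both knobs together, since
# the Descript Audio Codec model targets 44.1kHz:
#sample_rate: 44_100
#audio_backend: "dac"
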
models:
- name: "ar+nar" # vanity name
  size: "full" # model dimensionality
  resp_levels: 8 # RVQ levels this model targets
  prom_levels: 8 # should always match the above
  tasks: 8 # tasks this model can attend to, only tts is supported at the moment
  langs: 2 # languages this model supports, semi-unused at the moment
  tones: 1 # tones this model supports, currently unused
  arch_type: llama # underlying LLM arch to use, currently focusing on llama
  training: True # signals this model is to be trained
  version: 5 # helps keep backwards compatibility for when I add new things to the model
  attention: auto # attention mechanism to use, "auto" for safety
  dropout: 0.1 # percentage of the model to disable during training

  # factors for split loss values, remove to have a unified loss calculation
  loss_factors:
    text: 0.1 # text phoneme portion of the sequence
    prom: 0.0 # input prompt portion of the sequence
    resp: 1.0 # output audio portion of the sequence
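    # (assumption, going by the comment above) each portion's loss term is
    # scaled by its factor, so prom: 0.0 drops the prompt from the loss entirely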

  # experimental settings
  experimental:
    hf: False # uses vall_e.models.experimental, a wrapper around a HF model that could technically be used for non-pytorch backends later
    interleave: False # interleaves RVQ levels, only works with the above for now
    audio_embedding_mode: "" # "" | "inclusive" | "exclusive", whether to utilize the audio backend's embeddings with the input embeddings
    audio_embedding_sums: False # whether the input embeddings include all prior RVQ levels (sums) or only the current one; further experimentation is needed to see if this matters

hyperparameters:
  autotune: False
  autotune_params:
    start_profile_step: 1
    end_profile_step: 50
    num_tuning_micro_batch_sizes: 8

  batch_size: 16
  gradient_accumulation_steps: 4
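  # with the two values above, one optimizer step covers 16 * 4 = 64 samples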
  gradient_clipping: 1.0
  warmup_steps: 100
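
  # Prodigy (https://github.com/konstmish/prodigy) estimates its own step size,
  # so learning_rate acts as a multiplier on that estimate; 1.0 is the default
  # its authors recommend, not an unusually large LR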
  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True

  scheduler: "" # ScheduleFree
  torch_scheduler: True

evaluation:
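  # (assumed from the key names) runs a sampling pass every `frequency` global
  # steps; the two temperatures below apply to the AR and NAR passes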
  batch_size: 8
  frequency: 5000
  size: 8

  steps: 500
  ar_temperature: 0.95
  nar_temperature: 0.25
  load_disabled_engines: True

trainer:
  #no_logger: True
  ddp: False
  #check_for_oom: False
  iterations: 1_000_000

  save_tag: step
  save_on_oom: True
  save_on_quit: True
  save_frequency: 250
  export_on_save: True
  keep_last_checkpoints: 4

  gradient_checkpointing: True
  strict_loading: False
  #load_state_dict: True
  #load_tag: "9500"
  #load_states: False
  #restart_step_count: True

  gc_mode: None # "global_step"

  weight_dtype: float32 # float16 or bfloat16
  amp: False

  backend: deepspeed
  deepspeed:
    inferencing: True
    zero_optimization_level: 0
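    # standard DeepSpeed ZeRO semantics: 0 disables partitioning, while stages
    # 1/2/3 shard optimizer states, gradients, and parameters across GPUs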
    use_compression_training: False
    amp: False

  load_webui: False

inference:
  backend: deepspeed
  normalize: False

  weight_dtype: float32 # float16 or bfloat16
  amp: False

optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False

dataset:
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
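  # e.g. a (hypothetical) utterance path ./training/data/LibriTTS/100/0001.wav
  # yields speaker name "LibriTTS_100" and group "LibriTTS"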
  speaker_languages:
    ja: []

  use_hdf5: True
  use_metadata: True
  hdf5_flag: r
  validate: True

  workers: 2
  cache: True

  duration_range: [3.0, 5.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration: 3.0

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path # path | speaker | group
  sample_order: duration # shuffle | duration
  sample_max_duration_batch: 0 # used when the above is "duration"; 120 seconds per batch at 12GiB of VRAM works

  tasks_list: [ "tts" ] # the full set would be [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse" ]

  training: []
  validation: []
  noise: []
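
  # dataset directories normally go in the three lists above; a hypothetical
  # entry would look like:
  #training:
  #- ./training/data/LibriTTS/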