vall-e/data/config.yaml

models:
- name: "ar+nar"
  size: "full"
  resp_levels: 8
  prom_levels: 8
  tasks: 8
  langs: 2
  tones: 1
  arch_type: llama
  training: True
  version: 4
  attention: flash_attention_2
  dropout: 0.1
  loss_factors:
    text: 0.1
    resp: 1.0
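  # Presumably the relative loss weights over the combined input sequence:
  # text tokens are down-weighted (0.1) against the audio response tokens (1.0).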
hyperparameters:
  autotune: False
  autotune_params:
    start_profile_step: 1
    end_profile_step: 50
    num_tuning_micro_batch_sizes: 8

  batch_size: 16
  gradient_accumulation_steps: 4
  gradient_clipping: 1.0
  warmup_steps: 100

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True
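  # Prodigy adapts its own step size, so learning_rate stays at the conventional
  # 1.0 rather than a hand-tuned value; the effective batch size works out to
  # batch_size * gradient_accumulation_steps = 16 * 4 = 64 samples per update.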
  scheduler: "" # ScheduleFree
  torch_scheduler: True
evaluation:
  batch_size: 8
  frequency: 5000
  size: 8
  steps: 500
  ar_temperature: 0.95
  nar_temperature: 0.25
  load_disabled_engines: True
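  # Presumably: an evaluation pass runs every `frequency` (5000) training steps
  # over `size` samples, with the AR/NAR temperatures above used when sampling
  # during those passes.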
trainer:
  #no_logger: True
  ddp: False
  #check_for_oom: False
  iterations: 1_000_000
  save_tag: step
  save_on_oom: True
  save_on_quit: True
  save_frequency: 250
  export_on_save: True
  keep_last_checkpoints: 4
  aggressive_optimizations: False
  load_disabled_engines: False
  #load_state_dict: True
  strict_loading: False
  #load_tag: "9500"
  #load_states: False
  #restart_step_count: True
  gc_mode: None # "global_step"

  weight_dtype: float32 # float16 or bfloat16
  amp: False
  backend: deepspeed
  deepspeed:
    inferencing: True
    zero_optimization_level: 0
    use_compression_training: False
    amp: False

  activation_checkpointing: True
  load_webui: False
inference:
  backend: deepspeed
  audio_backend: "dac"
  normalize: False
  weight_dtype: float32 # float16 or bfloat16
  amp: False

optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False

experimental: True # practically required now it seems
dataset:
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
  speaker_languages:
    ja: []

  use_hdf5: True
  use_metadata: True
  hdf5_flag: r
  validate: True

  workers: 2
  cache: True
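  # The sampling settings below are presumably in seconds: training utterances
  # are drawn from the 3-5 s duration range and the acoustic prompt is trimmed
  # to roughly 3 s.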
  duration_range: [3.0, 5.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration: 3.0

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path # speaker

  tasks_list: [ "tts" ] # , [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]

  training: []
  validation: []
  noise: []