2023-08-02 21:53:35 +00:00
|
|
|
# todo: clean this mess up
|
|
|
|
|
|
|
|
from .config import cfg
|
|
|
|
from .data import create_train_val_dataloader
|
|
|
|
from .emb import qnt
|
|
|
|
|
|
|
|
from .utils import setup_logging, to_device, trainer, flatten_dict, do_gc
|
2024-06-04 02:28:49 +00:00
|
|
|
from .data import fold_inputs, unfold_outputs
|
2023-08-02 21:53:35 +00:00
|
|
|
|
|
|
|
import auraloss
|
|
|
|
import json
|
|
|
|
import logging
|
|
|
|
import random
|
|
|
|
import torch
|
|
|
|
import torch.nn.functional as F
|
|
|
|
import traceback
|
|
|
|
|
|
|
|
from collections import defaultdict
|
|
|
|
|
|
|
|
from tqdm import tqdm
|
2024-05-25 22:39:51 +00:00
|
|
|
import argparse
|
2023-08-02 21:53:35 +00:00
|
|
|
|
|
|
|
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)

# auraloss mel-STFT loss, built once at import time from the configured sample
# rate and placed on the CPU.
mel_stft_loss = auraloss.freq.MelSTFTLoss(cfg.sample_rate, device="cpu")
|
|
|
|
|
2023-08-04 01:26:36 +00:00
|
|
|
def train_feeder(engine, batch):
    """Run one forward pass of `engine` over `batch` and gather its losses.

    Args:
        engine: callable engine exposing `hyper_config`, `gather_attribute`
            and a `tokens_processed` counter.
        batch: mapping with at least "text", "proms" and "resps"; "lang" is
            additionally read on the non-experimental path.

    Returns:
        A `(loss, stats)` tuple: the sum of every gathered loss tensor, and a
        dict of Python scalars built from the gathered losses and stats.
    """
    texts = batch["text"]
    proms = batch["proms"]
    resps = batch["resps"]

    with torch.autocast("cuda", dtype=cfg.trainer.dtype, enabled=cfg.trainer.amp):
        if not engine.hyper_config.experimental:
            engine(
                text_list=texts,
                # reduce the input prompt to the target prom level
                proms_list=[prom[:, :engine._cfg.prom_levels] for prom in proms],
                resps_list=resps,
                lang_list=batch["lang"],
            )
        else:
            if cfg.model.interleave:
                quant_levels = None
                resps_list = list(resps)
            else:
                # One random level per sample; level 0 is only drawn when the
                # model has the "ar" capability.
                lowest_level = 0 if "ar" in cfg.model.capabilities else 1
                quant_levels = torch.randint(lowest_level, cfg.model.max_levels, (len(texts),))
                # Samples that drew level 0 get an empty response as input.
                resps_list = [
                    [] if level == 0 else resp
                    for level, resp in zip(quant_levels, resps)
                ]

            # Inputs and labels are folded from the same arguments; only the
            # labels pass `ignore_index`.
            fold_kwargs = dict(
                text_list=texts,
                prom_list=proms,
                resp_list=resps_list,
                targ_list=resps,
                quant_levels=quant_levels,
            )
            input_ids, _ = fold_inputs(**fold_kwargs)
            target_ids, _ = fold_inputs(**fold_kwargs, ignore_index=-100)

            engine(
                input_ids=input_ids,
                labels=target_ids,
            )

        losses = engine.gather_attribute("loss")
        stat = engine.gather_attribute("stats")

        loss = torch.stack(list(losses.values())).sum()

    # Losses first, then stats, so a duplicate key takes the stats value.
    stats = {
        key: value.item()
        for gathered in (losses, stat)
        for key, value in gathered.items()
    }

    engine.tokens_processed += sum(text.shape[0] for text in texts)
    engine.tokens_processed += sum(resp.shape[0] for resp in resps)

    return loss, stats
|
|
|
|
|
|
|
|
@torch.inference_mode()
def run_eval(engines, eval_name, dl):
    """Generate responses for evaluation batches and log a mel-STFT pseudo loss.

    Batches are drawn from `dl` until at least `cfg.evaluation.size` samples
    have been seen. For each batch, every engine in `engines` generates a
    response per sample; reference, hypothesis and prompt are decoded through
    `qnt.decode_to_file` under `cfg.log_dir / <global_step>`, and the mel-STFT
    loss between hypothesis and reference audio is recorded.

    Args:
        engines: mapping of engine name -> engine, also exposing `global_step`.
        eval_name: label for this evaluation split; used in the output path
            and in the reported metrics key.
        dl: dataloader to draw evaluation batches from.

    Returns:
        None. The averaged metrics are written through `tqdm.write` or the
        module logger, depending on `cfg.trainer.no_logger`.
    """
    stats = defaultdict(list)
    stats['loss'] = []

    def process(name, batch, resps_list):
        """Decode each sample of `batch` to audio and record its pseudo loss."""
        samples = zip(
            batch["spkr_name"],
            batch["path"],
            batch["resps"],
            resps_list,
            batch["proms"],
            batch["task"],
        )
        for speaker, path, ref, hyp, prom, task in samples:
            # nothing was generated for this sample
            if len(hyp) == 0:
                continue

            filename = f'{speaker}_{path.parts[-1]}'
            if task != "tts":
                filename = f"{filename}_{task}"

            # to-do, refine the output dir to be sane-er
            step_dir = cfg.log_dir / str(engines.global_step)
            ref_path = (step_dir / "ref" / filename).with_suffix(".wav")
            hyp_path = (step_dir / name / eval_name / filename).with_suffix(".wav")
            prom_path = (step_dir / name / "prom" / filename).with_suffix(".wav")

            hyp_path.parent.mkdir(parents=True, exist_ok=True)
            ref_path.parent.mkdir(parents=True, exist_ok=True)
            prom_path.parent.mkdir(parents=True, exist_ok=True)

            ref_audio, _ = qnt.decode_to_file(ref, ref_path)
            hyp_audio, _ = qnt.decode_to_file(hyp, hyp_path)
            # The prompt is decoded only for its output file; its audio is
            # not used in the loss.
            qnt.decode_to_file(prom, prom_path)

            # pseudo loss calculation since we don't get the logits during eval
            min_length = min(ref_audio.shape[-1], hyp_audio.shape[-1])
            ref_audio = ref_audio[..., 0:min_length]
            hyp_audio = hyp_audio[..., 0:min_length]
            stats['loss'].append(mel_stft_loss(hyp_audio[None, :, :], ref_audio[None, :, :]).item())

    processed = 0
    while processed < cfg.evaluation.size:
        # NOTE: a fresh iterator is created for every batch, so which batch
        # comes back depends on how `dl` orders or samples its data.
        batch: dict = to_device(next(iter(dl)), cfg.device)
        processed += len(batch["text"])

        for name in engines:
            engine = engines[name]

            if engine.hyper_config.experimental:
                if cfg.model.interleave:
                    input_ids, attention_mask = fold_inputs(
                        text_list=batch["text"],
                        prom_list=batch["proms"],
                    )
                    output = engine.module.generate(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        max_length=cfg.evaluation.steps,
                        eos_token_id=3,
                        do_sample=False,
                    )
                    resps_list = unfold_outputs(output)["resp_list"]
                else:
                    batch_size = len(batch["text"])
                    steps = cfg.evaluation.steps
                    resps_list = [[] for _ in range(batch_size)]

                    # Generate one quantizer level at a time, feeding the
                    # levels produced so far back in as the response.
                    for level in range(cfg.model.max_levels):
                        quant_levels = [[level] for _ in range(batch_size)]

                        input_ids, attention_mask = fold_inputs(
                            text_list=batch["text"],
                            prom_list=batch["proms"],
                            resp_list=resps_list,
                            quant_levels=quant_levels,
                            experimental=True,
                        )

                        # Require at least one token beyond the longest input.
                        # The loop variables here must not be called `batch`:
                        # the batch dict is still needed afterwards.
                        min_length = 1
                        for ids in input_ids:
                            min_length = max(min_length, ids.shape[0] + 1)

                        output = engine.module.generate(
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            min_length=min_length,
                            max_length=min_length + steps * (2 if level > 0 else 1),
                            eos_token_id=3,
                            do_sample=False,
                        )

                        unfolded = unfold_outputs(output, quant_levels=quant_levels)

                        # After level 0, `steps` becomes the longest level-0
                        # response instead of the configured budget.
                        if level == 0:
                            steps = 0

                        for index, resp in enumerate(unfolded["resp_list"]):
                            length = resp.shape[-1]

                            if level == 0:
                                # store length
                                steps = max(steps, length)
                            else:
                                # truncate or zero-pad to the level-0 length
                                resp = resp[:steps]
                                if length < steps:
                                    resp = torch.cat([resp, resp.new_zeros(steps - length)])

                            resps_list[index].append(resp)

                    # Stack each sample's per-level responses and transpose.
                    for index, levels in enumerate(resps_list):
                        resps_list[index] = torch.stack(levels).t()
            else:
                if "ar" in engine.hyper_config.capabilities:
                    resps_list = engine(
                        text_list=batch["text"],
                        proms_list=batch["proms"],
                        lang_list=batch["lang"],
                        max_steps=cfg.evaluation.steps,
                        sampling_temperature=cfg.evaluation.ar_temperature,
                    )
                else:
                    # No AR capability: take the first column of each
                    # reference response as the starting point.
                    resps_list = [resp[:, 0] for resp in batch["resps"]]

                if "nar" in engine.hyper_config.capabilities:
                    resps_list = engine(
                        text_list=batch["text"],
                        proms_list=batch["proms"],
                        lang_list=batch["lang"],
                        resps_list=resps_list,
                        sampling_temperature=cfg.evaluation.nar_temperature,
                    )

            process(name, batch, resps_list)

    # Skip metrics with no samples so an evaluation that produced no
    # hypotheses does not divide by zero.
    stats = {k: sum(v) / len(v) for k, v in stats.items() if v}

    # NOTE: `stats` is pooled over every engine, while the key uses the name
    # of the last engine iterated.
    engines_stats = {
        f'{name}.{eval_name}': stats,
        "it": engines.global_step,
    }
    #engines_stats['epoch'] = iteration * cfg.hyperparameters.gradient_accumulation_steps / len(dl)

    message = f"Validation Metrics: {json.dumps(engines_stats)}."
    if cfg.trainer.no_logger:
        tqdm.write(message)
    else:
        _logger.info(message)
|
2023-08-04 01:26:36 +00:00
|
|
|
|
2023-08-02 21:53:35 +00:00
|
|
|
|
2023-10-21 14:55:38 +00:00
|
|
|
def train():
    """Parse CLI flags, build the dataloaders and either evaluate or train.

    With `--eval`, the engines are loaded and a single evaluation pass is run
    and returned; otherwise control is handed to `trainer.train`.
    """
    cli = argparse.ArgumentParser("VALL-E TTS")
    cli.add_argument("--eval", action="store_true", default=None)
    # Unrecognised arguments are tolerated rather than rejected.
    args, _ = cli.parse_known_args()

    setup_logging(cfg.log_dir)

    train_dl, subtrain_dl, val_dl = create_train_val_dataloader()

    def eval_fn(engines):
        """Evaluate on the subtrain and val loaders, then return to train mode."""
        do_gc()
        engines.eval()

        # wrapped in a try block because it's sometimes prone to breaking
        try:
            for split_name, loader in (("subtrain", subtrain_dl), ("val", val_dl)):
                run_eval(engines, split_name, loader)
        except Exception as e:
            print("Error occurred while performing eval:", str(e))
            print(traceback.format_exc())

        engines.train()
        qnt.unload_model()
        do_gc()

    qnt.unload_model()

    if args.eval:
        loaded_engines = trainer.load_engines()
        return eval_fn(engines=loaded_engines)

    # Disabled webui hook, kept for reference:
    # if cfg.trainer.load_webui:
    #     from .webui import start
    #     start(lock=False)

    trainer.train(
        train_dl=train_dl,
        train_feeder=train_feeder,
        eval_fn=eval_fn,
    )
|
|
|
|
|
|
|
|
# Script entry point.
if __name__ == "__main__":
    # to-do: for DDP, spawn multiprocess instead of requiring `torchrun --nnodes=1 --nproc-per-node=4 -m vall_e.train yaml="./data/config.yaml"`
    train()
|