From f7f6d3bf6d4be31c16106c3ca333de1a4be27e26 Mon Sep 17 00:00:00 2001
From: mrq
Date: Sat, 19 Aug 2023 09:50:07 -0500
Subject: [PATCH] validated that SpeechX tasks cse and nse work, added a
 method to test each task by invoking `python3 -m vall_e.data --action=tasks
 --tasks='sr,se,cse,nse'`

---
 README.md        |  26 ++------
 data/config.yaml |   2 +-
 vall_e/config.py |  20 ++++--
 vall_e/data.py   | 170 ++++++++++++++++++----------------------------
 4 files changed, 84 insertions(+), 134 deletions(-)

diff --git a/README.md b/README.md
index cf409fe..a25ec5b 100755
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ If you're wanting to increase the `prom_levels` for a given model, or increase t
 
 4. Customize your configuration and define the dataset by modifying `./data/config.yaml`. Refer to `./vall_e/config.py` for details. If you want to choose between different model presets, check `./vall_e/models/__init__.py`.
 
-If you're interested in creating an HDF5 copy of your dataset, simply invoke: `python -m vall_e.data --create-hdf5 yaml='./data/config.yaml'`
+If you're interested in creating an HDF5 copy of your dataset, simply invoke: `python -m vall_e.data --action='hdf5' yaml='./data/config.yaml'`
 
 5. Train the AR and NAR models using the following scripts: `python -m vall_e.train yaml=./data/config.yaml`
@@ -81,32 +81,18 @@ Two dataset formats are supported:
 
 ## Export
 
-Both trained models *can* be exported, but is only required if loading them on systems without DeepSpeed for inferencing (Windows systems). To export the models, run:
+Both trained models *can* be exported, but this is only required if loading them on systems without DeepSpeed for inferencing (Windows systems). To export the models, run: `python -m vall_e.export yaml=./data/config.yaml`.
 
-```
-python -m vall_e.export yaml=./data/config.yaml
-```
-
-This will export the latest checkpoints.
+This will export the latest checkpoints under `./data/ckpt/ar-retnet-2/fp32.pth` and `./data/ckpt/nar-retnet-2/fp32.pth` to be loaded on any system with PyTorch.
 
 ## Synthesis
 
-To synthesize speech, invoke either (if exported the models):
-
-```
-python -m vall_e --ar-ckpt ./models/ar.pt --nar-ckpt ./models/nar.pt
-```
-
-or:
-
-```
-python -m vall_e yaml=<yaml>
-```
+To synthesize speech, invoke either (if you have exported the models): `python -m vall_e --ar-ckpt ./models/ar.pt --nar-ckpt ./models/nar.pt` or `python -m vall_e yaml=<yaml>`
 
 Some additional flags you can pass are:
 * `--max-ar-steps`: maximum steps for inferencing through the AR model. Each second is 75 steps.
-* `--ar-temp`: sampling temperature to use for the AR pass.
-* `--nar-temp`: sampling temperature to use for the NAR pass.
+* `--ar-temp`: sampling temperature to use for the AR pass. During experimentation, `0.95` provides the most consistent output.
+* `--nar-temp`: sampling temperature to use for the NAR pass. During experimentation, `0.2` provides the cleanest output.
 * `--device`: device to use (default: `cuda`, examples: `cuda:0`, `cuda:1`, `cpu`)
 
 ## To-Do
diff --git a/data/config.yaml b/data/config.yaml
index 65ec089..3c70cd0 100755
--- a/data/config.yaml
+++ b/data/config.yaml
@@ -72,7 +72,7 @@ evaluation:
   steps: 300
   ar_temperature: 1.0
-  nar_temperature: 1.0
+  nar_temperature: 0.2
 
 trainer:
   iterations: 1_000_000
diff --git a/vall_e/config.py b/vall_e/config.py
index 2057007..b3039e3 100755
--- a/vall_e/config.py
+++ b/vall_e/config.py
@@ -446,6 +446,18 @@ class Config(_Config):
 		tmp = Config.from_yaml( config_path )
 		self.__dict__.update(tmp.__dict__)
 
+	def load_hdf5( self, write=False ):
+		if hasattr(self, 'hdf5'):
+			self.hdf5.close()
+
+		if self.distributed:
+			self.dataset.hdf5_flag = "r"
+		try:
+			self.hdf5 = h5py.File(f'{self.cfg_path}/{self.dataset.hdf5_name}', 'a' if write else self.dataset.hdf5_flag) # to-do, have an easy to set flag that determines if training or creating the dataset
+		except Exception as e:
+			print("Error while opening HDF5 file:", f'{self.cfg_path}/{self.dataset.hdf5_name}', str(e))
+			self.dataset.use_hdf5 = False
+
 	def format( self ):
 		self.dataset = Dataset(**self.dataset)
 		self.models = Models(**self.models)
@@ -466,13 +478,7 @@ try:
 
 	# cached_property stopped working...
 	if cfg.dataset.use_hdf5:
-		if cfg.distributed:
-			cfg.dataset.hdf5_flag = "r"
-		try:
-			cfg.hdf5 = h5py.File(f'{cfg.cfg_path}/{cfg.dataset.hdf5_name}', cfg.dataset.hdf5_flag) # to-do, have an easy to set flag that determines if training or creating the dataset
-		except Exception as e:
-			print("Error while opening HDF5 file:", f'{cfg.cfg_path}/{cfg.dataset.hdf5_name}', str(e))
-			cfg.dataset.use_hdf5 = False
+		cfg.load_hdf5()
 
 	if not cfg.dataset.use_hdf5:
 		cfg.dataset.training = [ Path(dir) for dir in cfg.dataset.training ]
diff --git a/vall_e/data.py b/vall_e/data.py
index 241c90e..9d97207 100755
--- a/vall_e/data.py
+++ b/vall_e/data.py
@@ -300,6 +300,11 @@ class Dataset(_Dataset):
 
 		task = random.choice(self.tasks)
 
+		# ensure a speaker has at least four utterances
+		# default to tts if not
+		if len(set(self.paths_by_spkr_name[spkr_name]) - {path}) < 4:
+			task = "tts"
+
 		noise_scale = 0.125
 		# text-to-speech
 		if task == "tts":
@@ -349,7 +354,7 @@ class Dataset(_Dataset):
 		# clean speech editing
 		elif task == "cse" or task == "nse":
 			choices = set(self.paths_by_spkr_name[spkr_name]) - {path}
-			sampled = random.choice([*choices], 4)
+			sampled = random.sample([*choices], 4)
 
 			if cfg.dataset.use_hdf5:
 				texts = [ torch.from_numpy(cfg.hdf5[_get_hdf5_path(path)]["text"][:]).to(self.text_dtype) for path in sampled ]
@@ -359,8 +364,8 @@ class Dataset(_Dataset):
 				qnts = [ _load_quants(path) for path in sampled ]
 
 			# remove <s></s>
-			for text in texts:
-				text = text[1:-1]
+			for i in range(len(texts)):
+				texts[i] = texts[i][1:-1]
 
 			pre_text, mid_text, post_text, edit_text = texts
 			pre_prom, mid_prom, post_prom, edit_prom = qnts
@@ -376,11 +381,11 @@ class Dataset(_Dataset):
 
 			# create new text
 			text = torch.cat(
-				[ 1 ] + # <s>
-				([ pre_text ] if pre_text is not None else []) +
-				[ edit_text ] +
-				([ post_post ] if post_post is not None else []) +
-				[ 2 ] # </s>
+				[ torch.Tensor( [ 1 ] ).to(dtype=self.text_dtype) ] + # <s>
+				([ pre_text, torch.Tensor( [ 3 ] ).to(dtype=self.text_dtype) ] if pre_text is not None else []) + # pre_text + 'space'
+				[ edit_text ] + # 'edit text'
+				([ torch.Tensor( [ 3 ] ).to(dtype=self.text_dtype), post_text ] if post_text is not None else []) + # 'space' + post_text
+				[ torch.Tensor( [ 2 ] ).to(dtype=self.text_dtype) ] # </s>
 			)
 
 			if task == "nse":
@@ -397,7 +402,7 @@ class Dataset(_Dataset):
 					# extend the noise to fill the target audio
 					n = repeat_extend_audio(noise, proms.shape[0])
 					# merge the noise over the utterance
-					return merge_audio(proms, noise, scale=[1, noise_scale], device="cpu")
+					return merge_audio(proms, n, scale=[1, noise_scale], device="cpu")
 
 				# apply noise to all pieces
 				pre_prom = noise_proms( pre_prom )
@@ -649,14 +654,17 @@ def create_train_val_dataloader():
 
 	return train_dl, subtrain_dl, val_dl
 
-# parse yaml to create an hdf5 tile
+# parse yaml to create an hdf5 file
 def create_dataset_hdf5():
+	cfg.dataset.use_hdf5 = True
+	cfg.load_hdf5(write=True)
+
 	symmap = get_phone_symmap()
 
 	root = cfg.cfg_path
 	hf = cfg.hdf5
 
-	def add( dir, type="training" ):
+	def add( dir, type="training", audios=True, texts=True ):
 		dir = "./" + str(dir)
 		name = dir.replace(root, "")
@@ -670,7 +678,10 @@ def create_dataset_hdf5():
 		# grab IDs for every file
 		ids = { ".".join(file.split(".")[:-2]) for file in files }
 		for id in tqdm(ids, desc=f"Processing {name}"):
-			if not os.path.exists(f'{root}/{name}/{id}.qnt.pt') or not os.path.exists(f'{root}/{name}/{id}.phn.txt'):
+			audio_exists = os.path.exists(f'{root}/{name}/{id}.qnt.pt') if audios else True
+			text_exists = os.path.exists(f'{root}/{name}/{id}.phn.txt') if texts else True
+
+			if not audio_exists or not text_exists:
 				continue
 
 			key = f'{type}/{name}/{id}'
@@ -681,27 +692,29 @@ def create_dataset_hdf5():
 			group = hf.create_group(key)
 
 			# audio
-			qnt = torch.load(f'{root}/{name}/{id}.qnt.pt')[0].t()
-			group.create_dataset('audio', data=qnt.numpy(), compression='lzf')
+			if audios:
+				qnt = torch.load(f'{root}/{name}/{id}.qnt.pt')[0].t()
+				group.create_dataset('audio', data=qnt.numpy(), compression='lzf')
 
 			# text
-			with open(f'{root}/{name}/{id}.phn.txt', "r", encoding="utf8") as f:
-				content = f.read()
-				split = content.split(" ")
-				phones = [f"<s>"] + [ " " if not p else p for p in split ] + [f"</s>"]
-				for s in set(phones):
-					if s not in symmap:
-						symmap[s] = len(symmap.keys())
-				phn = [ symmap[s] for s in phones ]
+			if texts:
+				with open(f'{root}/{name}/{id}.phn.txt', "r", encoding="utf8") as f:
+					content = f.read()
+					split = content.split(" ")
+					phones = [f"<s>"] + [ " " if not p else p for p in split ] + [f"</s>"]
+					for s in set(phones):
+						if s not in symmap:
+							symmap[s] = len(symmap.keys())
+					phn = [ symmap[s] for s in phones ]
 
-			group.create_dataset('text', data=phn, compression='lzf', chunks=True)
+				group.create_dataset('text', data=phn, compression='lzf', chunks=True)
 
-			# metadata
-			group.attrs['id'] = id
-			group.attrs['type'] = type
-			group.attrs['speaker'] = name
-			group.attrs['duration'] = qnt.shape[0] / 75
-			group.attrs['phonemes'] = len(phn)
+				# metadata
+				group.attrs['id'] = id
+				group.attrs['type'] = type
+				group.attrs['speaker'] = name
+				group.attrs['duration'] = qnt.shape[0] / 75
+				group.attrs['phonemes'] = len(phn)
 
 	# training
 	for data_dir in tqdm(cfg.dataset.training, desc="Processing Training"):
@@ -713,10 +726,13 @@ def create_dataset_hdf5():
 
 	# noise
 	for data_dir in tqdm(cfg.dataset.noise, desc='Processing Noise'):
-		add( data_dir, type="noise" )
+		add( data_dir, type="noise", texts=False )
 
 	# write symmap
-	hf.create_dataset('symmap', data=json.dumps(symmap))
+	try:
+		hf.create_dataset('symmap', data=json.dumps(symmap))
+	except Exception as e:
+		pass
 
 	hf.close()
 
@@ -724,14 +740,15 @@ if __name__ == "__main__":
 	import argparse
 
 	parser = argparse.ArgumentParser("Save trained model to path.")
-	parser.add_argument("--task", type=str)
+	parser.add_argument("--action", type=str)
+	parser.add_argument("--tasks", type=str)
 	args = parser.parse_args()
 
-	task = args.task
+	task = args.action
 
-	if args.task == "hdf5":
+	if args.action == "hdf5":
 		create_dataset_hdf5()
-	elif args.task == "sample":
+	elif args.action == "sample":
 		train_dl, subtrain_dl, val_dl = create_train_val_dataloader()
 
 		samples = {
@@ -745,77 +762,18 @@ if __name__ == "__main__":
 				del v[i]['proms']
 				del v[i]['resps']
 			print(f'{k}:', v)
-	"""
-	elif args.task == "tasks":
+	elif args.action == "tasks":
 		index = 0
-		task = "ns"
+		cfg.dataset.tasks_list = args.tasks.split(",")
+
+		train_dl, subtrain_dl, val_dl = create_train_val_dataloader()
+		batch = next(iter(train_dl))
 
-		train_dataset, val_dataset = create_datasets()
-		train_dataset.task_symmap = get_task_symmap()
+		for text, resps, proms, task in zip(batch["text"], batch["resps"], batch["proms"], batch["task"]):
+			if task not in cfg.dataset.tasks_list:
+				continue
 
-		if cfg.dataset.sample_type == "speaker":
-			spkr_name = train_dataset.spkrs[index]
-			spkr_id = train_dataset.spkr_symmap[spkr_name]
-			path = random.choice([*set(train_dataset.paths_by_spkr_name[spkr_name])])
-		else:
-			path = train_dataset.paths[index]
-			spkr_name = cfg.get_spkr(path)
-			spkr_id = train_dataset.spkr_symmap[spkr_name]
-
-		if cfg.dataset.use_hdf5:
-			key = _get_hdf5_path(path)
-			text = torch.from_numpy(cfg.hdf5[key]["text"][:]).to(train_dataset.text_dtype)
-			resps = torch.from_numpy(cfg.hdf5[key]["audio"][:, :cfg.models.prom_levels]).to(torch.int16)
-		else:
-			text = torch.tensor([*map(train_dataset.phone_symmap.get, _get_phones(path))]).to(train_dataset.text_dtype)
-			resps = _load_quants(path)
-
-		noise = None
-		if task == "ns" or task == "sr":
-			# sample random noise
-			noise = train_dataset.sample_noise()
-
-			decode_to_file( noise, "./.noise.wav", device="cpu" )
-
-			# extend the noise to fill the target audio
-			noise = repeat_extend_audio(noise, resps.shape[0])
-			# create the input prompt by merging the target audio with the noise
-			proms = merge_audio(resps, noise, scale=[1, 0.125])
-			# set the target to just be the noise if <sr>
-			if task == "sr":
-				resps = noise
-			# prepend the task token
-			proms = torch.cat( [train_dataset.get_task_token(task), proms] )
-
-			# set the text prompt to empty to train without a guided text prompt
-			if random.random() < 0.5:
-				text = torch.tensor([1, 2]).to(train_dataset.text_dtype)
-		# target speech extraction
-		elif task == "tse":
-			# sample a random, clean, utterance for the target speaker
-			clean_proms = train_dataset.sample_prompts(spkr_name, ignore=path) if random.random() < cfg.dataset.random_utterance else resps
-			# sample a random, clean utterance from a different speaker
-			other_proms = train_dataset.sample_prompts(train_dataset.sample_speakers(ignore=[spkr_name]), ignore="")
-			# overlay the random speaker over the target audio
-
-			smallest_size = min(resps.shape[0], other_proms.shape[0])
-			if other_proms.shape[0] == smallest_size:
-				noisy_proms = merge_audio( resps[:smallest_size, :], other_proms )
-				noisy_proms = torch.cat( [ noisy_proms, resps[smallest_size:, :] ] )
-			else:
-				noisy_proms = merge_audio( resps, other_proms[:smallest_size, :] )
-				noisy_proms = torch.cat( [ noisy_proms, other_proms[smallest_size:, :] ] )
-
-			# stitch together the promps
-			proms = torch.cat( [clean_proms, train_dataset.get_task_token(task), noisy_proms] )
-
-			# set the text prompt to empty to train without a guided text prompt
-			if random.random() < 0.5:
-				text = torch.tensor([1, 2]).to(train_dataset.text_dtype)
-
-		decode_to_file( proms, "./.proms.wav", device="cpu" )
-		decode_to_file( resps, "./.resps.wav", device="cpu" )
device="cpu" ) - - if noise is not None: - decode_to_file( noise, "./.noise-fill.wav", device="cpu" ) - """ \ No newline at end of file + print(text, task) + decode_to_file( proms, f"./.{task}.proms.wav", device="cpu" ) + decode_to_file( resps, f"./.{task}.resps.wav", device="cpu" ) + break \ No newline at end of file