# Descriptive preamble (free text ahead of the first "diff --git" header).
#
# What this patch changes:
#   * codes/data/audio/paired_voice_audio_dataset.py
#       - default 'tokenizer_vocab' path becomes
#         '../experiments/bpe_lowercase_asr_256.json'.
#   * codes/data/audio/voice_tokenizer_builder.py
#       - the bookcorpus load and its batch loop are commented out, so the
#         tokenizer is trained from all_texts.txt only;
#       - BpeTrainer vocab_size goes from 9999 to 511;
#       - train_from_iterator length becomes len(ttsd).
#   * codes/models/gpt_voice/unified_voice.py
#       - removes the imports of random, time, GPT2LMHeadModel,
#         GPT2PreTrainedModel, CausalLMOutputWithCrossAttentions,
#         get_device_map/assert_device_map and AudioMiniEncoder;
#       - NUMBER_TEXT_TOKENS 10000 -> 256, START_TEXT_TOKEN 9999 -> 255;
#       - max_symbols_per_phrase default 80 -> 120;
#       - the __main__ smoke test's first tensor becomes (2, 120, 800).
#
# NOTE(review): the trainer targets vocab_size=511 while NUMBER_TEXT_TOKENS
#   is 256 and the default vocab file is named "..._256.json" -- confirm
#   these are meant to differ.
# NOTE(review): the smoke test widens the first (float) tensor from 80 to 120
#   but leaves the text-token tensor at size=(2,80); max_symbols_per_phrase is
#   the value that moved 80 -> 120 -- confirm the intended argument.
# NOTE(review): leading indentation inside the hunks was rebuilt from Python
#   structure; verify against the target files before applying.
diff --git a/codes/data/audio/paired_voice_audio_dataset.py b/codes/data/audio/paired_voice_audio_dataset.py
index dbdf0f98..b7ea203b 100644
--- a/codes/data/audio/paired_voice_audio_dataset.py
+++ b/codes/data/audio/paired_voice_audio_dataset.py
@@ -86,7 +86,7 @@ class TextWavLoader(torch.utils.data.Dataset):
         self.needs_collate = opt_get(hparams, ['needs_collate'], True)
         if not self.needs_collate:
             assert self.max_wav_len is not None and self.max_text_len is not None
-        self.tokenizer = Tokenizer.from_file(opt_get(hparams, ['tokenizer_vocab'], '../experiments/custom_lowercase_gptvoice_tokenizer_r2.json'))
+        self.tokenizer = Tokenizer.from_file(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
 
     def get_wav_text_pair(self, audiopath_and_text):
         # separate filename and text
diff --git a/codes/data/audio/voice_tokenizer_builder.py b/codes/data/audio/voice_tokenizer_builder.py
index c2dd3edc..813fdf5e 100644
--- a/codes/data/audio/voice_tokenizer_builder.py
+++ b/codes/data/audio/voice_tokenizer_builder.py
@@ -33,7 +33,7 @@ def build_text_file_from_priors(priors, output):
 def train():
     with open('all_texts.txt', 'r', encoding='utf-8') as at:
         ttsd = at.readlines()
-    bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train']
+    #bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train']
 
     allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.\'\?—–ʼ]+$')
     def preprocess_word(word, report=False):
@@ -49,14 +49,14 @@ def train():
         for i in range(0, len(ttsd), batch_size):
             yield [preprocess_word(t, True) for t in ttsd[i:i+batch_size]]
 
-        print("Processing bookcorpus.")
-        for i in range(0, len(bcd), batch_size):
-            yield [preprocess_word(t) for t in bcd[i:i+batch_size]['text']]
+        #print("Processing bookcorpus.")
+        #for i in range(0, len(bcd), batch_size):
+        #    yield [preprocess_word(t) for t in bcd[i:i+batch_size]['text']]
 
-    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]'], vocab_size=9999, continuing_subword_prefix='$$$')
+    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]'], vocab_size=511, continuing_subword_prefix='$$$')
     tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     tokenizer.pre_tokenizer = Whitespace()
-    tokenizer.train_from_iterator(batch_iterator(), trainer, length=len(ttsd)+len(bcd))
+    tokenizer.train_from_iterator(batch_iterator(), trainer, length=len(ttsd))#+len(bcd))
 
     print(tokenizer.decode(tokenizer.encode("i was traveling throughhadslfghds the woods in 1235375t137{{}}").ids))
 
diff --git a/codes/models/gpt_voice/unified_voice.py b/codes/models/gpt_voice/unified_voice.py
index cc57b847..558a8d58 100644
--- a/codes/models/gpt_voice/unified_voice.py
+++ b/codes/models/gpt_voice/unified_voice.py
@@ -1,16 +1,10 @@
-import random
-from time import time
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel, GPT2PreTrainedModel
-from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
-from transformers.utils.model_parallel_utils import get_device_map, assert_device_map
+from transformers import GPT2Model, GPT2Config
 
 from models.arch_util import AttentionBlock
 from models.gpt_voice.gpt_asr_hf import GPT2InferenceModel
-from models.gpt_voice.mini_encoder import AudioMiniEncoder
 from models.tacotron2.text import symbols
 from trainer.networks import register_model
 from utils.util import opt_get
@@ -47,14 +41,14 @@ class UnifiedGptVoice(nn.Module):
         - Voice conditioned on text
     """
 
-    NUMBER_TEXT_TOKENS = 10000 # The number of tokens produced by our bespoke BPE tokenizer.
-    START_TEXT_TOKEN = 9999
+    NUMBER_TEXT_TOKENS = 256 # The number of tokens produced by our bespoke BPE tokenizer.
+    START_TEXT_TOKEN = 255
     STOP_TEXT_TOKEN = 0
     NUMBER_MEL_CODES = 8194
     START_MEL_TOKEN = 8192
     STOP_MEL_TOKEN = 8193
 
-    def __init__(self, layers=8, model_dim=512, heads=8, max_symbols_per_phrase=80, max_mel_tokens=250, max_conditioning_inputs=3,
+    def __init__(self, layers=8, model_dim=512, heads=8, max_symbols_per_phrase=120, max_mel_tokens=250, max_conditioning_inputs=3,
                  checkpointing=True, mel_length_compression=1024, max_conditioning_length=60):
         super().__init__()
 
@@ -222,7 +216,7 @@ def register_unified_gpt_voice(opt_net, opt):
 
 if __name__ == '__main__':
     gpt = UnifiedGptVoice(model_dim=256, heads=4)
-    l = gpt(torch.randn(2, 80, 800),
+    l = gpt(torch.randn(2, 120, 800),
             torch.randint(high=len(symbols), size=(2,80)),
             torch.randint(high=8192, size=(2,250)),
             torch.tensor([150*256,195*256]))