256-bpe tokenizer
commit 52410fd9d9
parent 8e26400ce2
@@ -86,7 +86,7 @@ class TextWavLoader(torch.utils.data.Dataset):
         self.needs_collate = opt_get(hparams, ['needs_collate'], True)
         if not self.needs_collate:
             assert self.max_wav_len is not None and self.max_text_len is not None
-        self.tokenizer = Tokenizer.from_file(opt_get(hparams, ['tokenizer_vocab'], '../experiments/custom_lowercase_gptvoice_tokenizer_r2.json'))
+        self.tokenizer = Tokenizer.from_file(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))

     def get_wav_text_pair(self, audiopath_and_text):
         # separate filename and text
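Note: the dataset now defaults to the 256-entry BPE vocab. As a rough, non-authoritative sketch of loading such a file with the HuggingFace tokenizers library and encoding a transcript (the sample sentence and tensor conversion are assumptions, not part of the commit):

from tokenizers import Tokenizer
import torch

# Hypothetical usage sketch: load the vocab file referenced above and
# encode a lowercase transcript into integer ids for the model.
tok = Tokenizer.from_file('../experiments/bpe_lowercase_asr_256.json')
ids = tok.encode("i was traveling through the woods").ids  # list of ints below the vocab size
text_codes = torch.IntTensor(ids)  # assumed conversion; the dataset's actual collation may differ
print(tok.decode(ids))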
@@ -33,7 +33,7 @@ def build_text_file_from_priors(priors, output):
 def train():
     with open('all_texts.txt', 'r', encoding='utf-8') as at:
         ttsd = at.readlines()
-    bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train']
+    #bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train']

     allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.\'\?—–ʼ]+$')
     def preprocess_word(word, report=False):
@@ -49,14 +49,14 @@ def train():
         for i in range(0, len(ttsd), batch_size):
             yield [preprocess_word(t, True) for t in ttsd[i:i+batch_size]]

-        print("Processing bookcorpus.")
-        for i in range(0, len(bcd), batch_size):
-            yield [preprocess_word(t) for t in bcd[i:i+batch_size]['text']]
+        #print("Processing bookcorpus.")
+        #for i in range(0, len(bcd), batch_size):
+            # yield [preprocess_word(t) for t in bcd[i:i+batch_size]['text']]

-    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]'], vocab_size=9999, continuing_subword_prefix='$$$')
+    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]'], vocab_size=511, continuing_subword_prefix='$$$')
     tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     tokenizer.pre_tokenizer = Whitespace()
-    tokenizer.train_from_iterator(batch_iterator(), trainer, length=len(ttsd)+len(bcd))
+    tokenizer.train_from_iterator(batch_iterator(), trainer, length=len(ttsd))#+len(bcd))

     print(tokenizer.decode(tokenizer.encode("i was traveling throughhadslfghds the woods in 1235375t137{{}}").ids))
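For reference, a self-contained sketch of the trainer configuration above (small BPE vocab, [STOP]/[UNK] specials, whitespace pre-tokenization, '$$$' continuing-subword prefix). The corpus file and save path below are placeholders, not from the commit:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Placeholder corpus; the commit uses all_texts.txt with extra filtering.
with open('my_texts.txt', 'r', encoding='utf-8') as f:
    lines = [l.strip().lower() for l in f]

def batches(batch_size=1024):
    for i in range(0, len(lines), batch_size):
        yield lines[i:i + batch_size]

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]'], vocab_size=511,
                     continuing_subword_prefix='$$$')
tokenizer.train_from_iterator(batches(), trainer, length=len(lines))
tokenizer.save('bpe_lowercase_asr_256.json')  # placeholder output path
print(tokenizer.get_vocab_size())  # at most 511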
@@ -1,16 +1,10 @@
-import random
-from time import time
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel, GPT2PreTrainedModel
-from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
-from transformers.utils.model_parallel_utils import get_device_map, assert_device_map
+from transformers import GPT2Model, GPT2Config

 from models.arch_util import AttentionBlock
 from models.gpt_voice.gpt_asr_hf import GPT2InferenceModel
-from models.gpt_voice.mini_encoder import AudioMiniEncoder
 from models.tacotron2.text import symbols
 from trainer.networks import register_model
 from utils.util import opt_get
@@ -47,14 +41,14 @@ class UnifiedGptVoice(nn.Module):
     - Voice conditioned on text
     """

-    NUMBER_TEXT_TOKENS = 10000 # The number of tokens produced by our bespoke BPE tokenizer.
-    START_TEXT_TOKEN = 9999
+    NUMBER_TEXT_TOKENS = 256 # The number of tokens produced by our bespoke BPE tokenizer.
+    START_TEXT_TOKEN = 255
     STOP_TEXT_TOKEN = 0
     NUMBER_MEL_CODES = 8194
     START_MEL_TOKEN = 8192
     STOP_MEL_TOKEN = 8193

-    def __init__(self, layers=8, model_dim=512, heads=8, max_symbols_per_phrase=80, max_mel_tokens=250, max_conditioning_inputs=3,
+    def __init__(self, layers=8, model_dim=512, heads=8, max_symbols_per_phrase=120, max_mel_tokens=250, max_conditioning_inputs=3,
                  checkpointing=True, mel_length_compression=1024, max_conditioning_length=60):
         super().__init__()
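With NUMBER_TEXT_TOKENS dropping to 256 and START_TEXT_TOKEN to 255, the text-token embedding and output head only need to span ids 0..255, and max_symbols_per_phrase grows to 120. A hypothetical sizing sketch, not the actual UnifiedGptVoice internals:

import torch
import torch.nn as nn

# Illustrative only; the real model layers are not shown in this diff.
NUMBER_TEXT_TOKENS = 256   # every id the 256-entry BPE tokenizer can emit
START_TEXT_TOKEN = 255
STOP_TEXT_TOKEN = 0

text_embedding = nn.Embedding(NUMBER_TEXT_TOKENS, 512)   # covers ids 0..255
text_head = nn.Linear(512, NUMBER_TEXT_TOKENS)           # logits over the same id range

tokens = torch.randint(high=NUMBER_TEXT_TOKENS, size=(2, 120))  # up to max_symbols_per_phrase=120
logits = text_head(text_embedding(tokens))                # shape (2, 120, 256)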
@@ -222,7 +216,7 @@ def register_unified_gpt_voice(opt_net, opt):

 if __name__ == '__main__':
     gpt = UnifiedGptVoice(model_dim=256, heads=4)
-    l = gpt(torch.randn(2, 80, 800),
+    l = gpt(torch.randn(2, 120, 800),
             torch.randint(high=len(symbols), size=(2,80)),
             torch.randint(high=8192, size=(2,250)),
             torch.tensor([150*256,195*256]))