Taking another stab at a BPE tokenizer

2021-12-30 13:41:24 -07:00 · 2021-12-30 13:41:24 -07:00 · f0c4cd6317
commit f0c4cd6317
parent 9aa06542cd
4 changed files with 71 additions and 18 deletions
--- a/codes/data/audio/gpt_tts_tokenizer.json
+++ b/codes/data/audio/gpt_tts_tokenizer.json
--- a/codes/data/audio/grand_conjoined_dataset.py
+++ b/codes/data/audio/grand_conjoined_dataset.py
@ -235,14 +235,14 @@ if __name__ == '__main__':
    m = None
    for i, b in tqdm(enumerate(dl)):
        for ib in range(batch_sz):
-            #save(b, i, ib, 'paired_audio')
-            #save(b, i, ib, 'paired_audio_conditioning', 0)
-            #save(b, i, ib, 'paired_audio_conditioning', 1)
-            #print(f'Paired file: {b["paired_file"][ib]} text: {b["paired_text"][ib]}')
-            #print(f'Paired text decoded: {decode(b, ib, "paired_text_tokens")}')
-            save(b, i, ib, 'speech_audio')
-            save(b, i, ib, 'speech_audio_conditioning', 0)
-            save(b, i, ib, 'speech_audio_conditioning', 1)
+            save(b, i, ib, 'paired_audio')
+            save(b, i, ib, 'paired_audio_conditioning', 0)
+            save(b, i, ib, 'paired_audio_conditioning', 1)
+            print(f'Paired file: {b["paired_file"][ib]} text: {b["paired_text"][ib]}')
+            print(f'Paired text decoded: {decode(b, ib, "paired_text_tokens")}')
+            #save(b, i, ib, 'speech_audio')
+            #save(b, i, ib, 'speech_audio_conditioning', 0)
+            #save(b, i, ib, 'speech_audio_conditioning', 1)
            #print(f'Text: {b["text_text"][ib]}')
            #print(f'Text decoded: {decode(b, ib, "text_tokens")}')
        if i > 5:
--- a/codes/data/audio/paired_voice_audio_dataset.py
+++ b/codes/data/audio/paired_voice_audio_dataset.py
@ -51,7 +51,7 @@ def load_voxpopuli(filename):

 class CharacterTokenizer:
    def encode(self, txt):
-        return munchify({'ids': text_to_sequence(txt, ['english_cleaners'])})
+        return text_to_sequence(txt, ['english_cleaners'])

    def decode(self, seq):
        return sequence_to_text(seq)
@ -99,7 +99,8 @@ class TextWavLoader(torch.utils.data.Dataset):
            assert self.max_wav_len is not None and self.max_text_len is not None
        self.use_bpe_tokenizer = opt_get(hparams, ['use_bpe_tokenizer'], True)
        if self.use_bpe_tokenizer:
-            self.tokenizer = Tokenizer.from_file(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
+            from data.audio.voice_tokenizer import VoiceBpeTokenizer
+            self.tokenizer = VoiceBpeTokenizer(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
        else:
            self.tokenizer = CharacterTokenizer()

@ -111,7 +112,7 @@ class TextWavLoader(torch.utils.data.Dataset):
        return (text_seq, wav, text, audiopath_and_text[0])

    def get_text(self, text):
-        tokens = self.tokenizer.encode(text.strip().lower()).ids
+        tokens = self.tokenizer.encode(text)
        tokens = torch.IntTensor(tokens)
        if self.use_bpe_tokenizer:
            # Assert if any UNK,start tokens encountered.
@ -226,14 +227,14 @@ if __name__ == '__main__':
        'phase': 'train',
        'n_workers': 0,
        'batch_size': batch_sz,
-        'needs_collate': False,
+        'needs_collate': True,
        'max_wav_length': 255995,
        'max_text_length': 200,
        'sample_rate': 22050,
        'load_conditioning': True,
        'num_conditioning_candidates': 2,
        'conditioning_length': 44000,
-        'use_bpe_tokenizer': False,
+        'use_bpe_tokenizer': True,
    }
    from data import create_dataset, create_dataloader

--- a/codes/data/audio/voice_tokenizer_builder.py
+++ b/codes/data/audio/voice_tokenizer_builder.py
@ -1,6 +1,7 @@
 import re

 import datasets
+import torch
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.pre_tokenizers import Whitespace
@ -9,6 +10,25 @@ from tokenizers.trainers import BpeTrainer

 from data.audio.paired_voice_audio_dataset import load_mozilla_cv, load_voxpopuli, load_tsv
 from models.tacotron2.taco_utils import load_filepaths_and_text
+from models.tacotron2.text.cleaners import english_cleaners
+
+
+class VoiceBpeTokenizer:
+    """Thin wrapper around a `tokenizers.Tokenizer` loaded from a vocabulary file.
+
+    Text is cleaned and has its spaces swapped for a literal '[SPACE]' marker
+    before encoding; decoding reverses that substitution.
+    """
+    def __init__(self, vocab_file):
+        # vocab_file: path handed straight to Tokenizer.from_file().
+        self.tokenizer = Tokenizer.from_file(vocab_file)
+
+    def encode(self, txt):
+        """Clean `txt` and return the token ids produced by the wrapped tokenizer."""
+        cleaned = remove_extraneous_punctuation(english_cleaners(txt))
+        # Spaces are made explicit so they survive tokenization as '[SPACE]'.
+        encoding = self.tokenizer.encode(cleaned.replace(' ', '[SPACE]'))
+        return encoding.ids
+
+    def decode(self, seq):
+        """Turn a sequence of token ids (torch.Tensor or other sequence) back into text."""
+        ids = seq.cpu().numpy() if isinstance(seq, torch.Tensor) else seq
+        decoded = self.tokenizer.decode(ids, skip_special_tokens=False)
+        # Strip every space from the decoded string first, then restore the
+        # real spaces from their '[SPACE]' markers.
+        return decoded.replace(' ', '').replace('[SPACE]', ' ')


 def build_text_file_from_priors(priors, output):
@ -30,14 +50,33 @@ def build_text_file_from_priors(priors, output):
            out.flush()


+def remove_extraneous_punctuation(word):
+    """Normalize punctuation variants and drop symbols that have no spoken form here.
+
+    Two passes are applied to `word` (which may be a single word or a whole line):
+      1. Curly/square brackets become parentheses, backtick and 'ʼ' become an
+         apostrophe, and the em dash becomes a hyphen.
+      2. Every occurrence of @ # % _ = $ ^ & * + and backslash is removed.
+
+    Returns the rewritten string; an empty string is returned unchanged.
+    """
+    # NOTE(review): the previous literal listed '—' and '`' twice. The duplicate
+    # '—' may have been meant to be the en dash '–' - confirm before adding it.
+    replacement_punctuation = {
+        '{': '(', '}': ')',
+        '[': '(', ']': ')',
+        '`': '\'', '—': '-',
+        'ʼ': '\''
+    }
+    # Longest keys first so a longer key can never be shadowed by a shorter one.
+    replace = re.compile("|".join(re.escape(k) for k in sorted(replacement_punctuation, key=len, reverse=True)), flags=re.DOTALL)
+    word = replace.sub(lambda x: replacement_punctuation[x.group(0)], word)
+
+    # TODO: some of these are spoken ('@', '%', '+', etc). Integrate them into the cleaners.
+    # The character class is deliberately unanchored: with '^...$' around it the
+    # pattern only matched inputs that were exactly one symbol long, so symbols
+    # embedded in real text were never removed.
+    extraneous = re.compile(r'[@#%_=\$\^&\*\+\\]')
+    word = extraneous.sub('', word)
+    return word
+
+
 def train():
    with open('all_texts.txt', 'r', encoding='utf-8') as at:
        ttsd = at.readlines()
    #bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train']

-    allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.\'\?—–ʼ]+$')
+    #allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.\'\?—–ʼ]+$')
+    allowed_characters_re = re.compile(r'^[a-z!:;"/, \-\(\)\.\'\?ʼ]+$')
    def preprocess_word(word, report=False):
-        word = word.strip().lower()
+        word = english_cleaners(word)
+        word = remove_extraneous_punctuation(word)
        if not bool(allowed_characters_re.match(word)):
            if report and word:
                print(f"REPORTING: '{word}'")
@ -53,7 +92,7 @@ def train():
        #for i in range(0, len(bcd), batch_size):
        #    yield [preprocess_word(t) for t in bcd[i:i+batch_size]['text']]

-    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]'], vocab_size=511, continuing_subword_prefix='$$$')
+    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]', '[SPACE]'], vocab_size=255)
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator(batch_iterator(), trainer, length=len(ttsd))#+len(bcd))
@ -63,6 +102,18 @@ def train():
    tokenizer.save('gpt_tts_tokenizer.json')


+def test():
+    """Round-trip each line of all_texts.txt through the tokenizer and print both forms.
+
+    For every line, the stripped input is printed after '>>>' and the result of
+    encode-then-decode after '<<<', so the two can be compared by eye.
+    """
+    tokenizer = VoiceBpeTokenizer('gpt_tts_tokenizer.json')
+    with open('all_texts.txt', 'r', encoding='utf-8') as at:
+        for raw in at.readlines():
+            line = raw.strip()
+            out = tokenizer.decode(tokenizer.encode(line))
+            print(f">>>{line}")
+            print(f"<<<{out}")
+
+
 if __name__ == '__main__':
    '''
    build_text_file_from_priors([('Y:\\bigasr_dataset\\libritts\\train-all.txt', 'libritts'),
@ -73,4 +124,5 @@ if __name__ == '__main__':
                                 ('Y:\\clips\\books2-transcribed.tsv', 'tsv'),
                                 ('Y:\\clips\\podcasts-0-transcribed.tsv', 'tsv')], 'all_texts.txt')
    '''
-    train()
+    #train()
+    test()