Another fix

2021-12-22 18:30:50 -07:00 · 2021-12-22 18:30:50 -07:00 · 191e0130ee
commit 191e0130ee
parent 6c6daa5795
2 changed files with 4 additions and 6 deletions
--- a/codes/data/audio/paired_voice_audio_dataset.py
+++ b/codes/data/audio/paired_voice_audio_dataset.py
@@ -221,8 +221,8 @@ if __name__ == '__main__':
    batch_sz = 8
    params = {
        'mode': 'paired_voice_audio',
-        'path': ['Z:\\bigasr_dataset\\libritts\\test-clean_list.txt'],
+        'path': ['Z:\\clips\\podcasts-0-transcribed.tsv'],
-        'fetcher_mode': ['libritts'],
+        'fetcher_mode': ['tsv'],
        'phase': 'train',
        'n_workers': 0,
        'batch_size': batch_sz,
@@ -230,9 +230,7 @@ if __name__ == '__main__':
        'max_wav_length': 255995,
        'max_text_length': 200,
        'sample_rate': 22050,
-        'load_conditioning': True,
+        'load_conditioning': False,
        'num_conditioning_candidates': 3,
        'conditioning_length': 44100,
    }
    from data import create_dataset, create_dataloader
--- a/codes/data/audio/voice_tokenizer_builder.py
+++ b/codes/data/audio/voice_tokenizer_builder.py
@@ -35,7 +35,7 @@ def train():
    bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train']
    wkd = datasets.load_dataset('wikipedia', '20200501.en', cache_dir='Z:\\huggingface_datasets\\cache')['train']
-    allowed_characters_re = re.compile(r'^[a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.]+$')
+    allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.\']+$')
    def preprocess_word(word):
        word = word.lower()
        if not bool(allowed_characters_re.match(word)):