From 191e0130ee1eb00b7c43650824846799c8048c03 Mon Sep 17 00:00:00 2001 From: James Betker Date: Wed, 22 Dec 2021 18:30:50 -0700 Subject: [PATCH] Another fix --- codes/data/audio/paired_voice_audio_dataset.py | 8 +++----- codes/data/audio/voice_tokenizer_builder.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/codes/data/audio/paired_voice_audio_dataset.py b/codes/data/audio/paired_voice_audio_dataset.py index df4f1537..d9725c60 100644 --- a/codes/data/audio/paired_voice_audio_dataset.py +++ b/codes/data/audio/paired_voice_audio_dataset.py @@ -221,8 +221,8 @@ if __name__ == '__main__': batch_sz = 8 params = { 'mode': 'paired_voice_audio', - 'path': ['Z:\\bigasr_dataset\\libritts\\test-clean_list.txt'], - 'fetcher_mode': ['libritts'], + 'path': ['Z:\\clips\\podcasts-0-transcribed.tsv'], + 'fetcher_mode': ['tsv'], 'phase': 'train', 'n_workers': 0, 'batch_size': batch_sz, @@ -230,9 +230,7 @@ if __name__ == '__main__': 'max_wav_length': 255995, 'max_text_length': 200, 'sample_rate': 22050, - 'load_conditioning': True, - 'num_conditioning_candidates': 3, - 'conditioning_length': 44100, + 'load_conditioning': False, } from data import create_dataset, create_dataloader diff --git a/codes/data/audio/voice_tokenizer_builder.py b/codes/data/audio/voice_tokenizer_builder.py index 3793bffc..4b4a2f24 100644 --- a/codes/data/audio/voice_tokenizer_builder.py +++ b/codes/data/audio/voice_tokenizer_builder.py @@ -35,7 +35,7 @@ def train(): bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train'] wkd = datasets.load_dataset('wikipedia', '20200501.en', cache_dir='Z:\\huggingface_datasets\\cache')['train'] - allowed_characters_re = re.compile(r'^[a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.]+$') + allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.\']+$') def preprocess_word(word): word = word.lower() if not bool(allowed_characters_re.match(word)):