diff --git a/scripts/deduplicate_librilight_libritts.py b/scripts/deduplicate_librilight_libritts.py
new file mode 100755
index 0000000..bb6e483
--- /dev/null
+++ b/scripts/deduplicate_librilight_libritts.py
@@ -0,0 +1,45 @@
+import os
+import json
+
+librilight_dir = "LibriLight-6K"
+libritts_dir = "LibriTTS-Train"
+
+librilight_data = {}
+libritts_data = {}
+
+for speaker_id in os.listdir(f'./{librilight_dir}/'):
+	for filename in os.listdir(f'./{librilight_dir}/{speaker_id}'):
+		parts = filename.split("_")
+		book_id = parts[1]
+		subid = parts[2]
+
+		if speaker_id not in librilight_data:
+			librilight_data[speaker_id] = {}
+		if book_id not in librilight_data[speaker_id]:
+			librilight_data[speaker_id][book_id] = []
+		librilight_data[speaker_id][book_id].append(subid)
+
+for speaker_id in os.listdir(f'./{libritts_dir}/'):
+	for filename in os.listdir(f'./{libritts_dir}/{speaker_id}'):
+		parts = filename.split("_")
+		book_id = parts[1]
+		subid = parts[2]
+
+		if speaker_id not in libritts_data:
+			libritts_data[speaker_id] = {}
+		if book_id not in libritts_data[speaker_id]:
+			libritts_data[speaker_id][book_id] = []
+		libritts_data[speaker_id][book_id].append(subid)
+
+duplicates = []
+
+for speaker_id, books in libritts_data.items():
+	if speaker_id not in librilight_data:
+		continue
+	for book_id, _ in books.items():
+		if book_id not in librilight_data[speaker_id]:
+			continue
+		print(f'Duplicate: {speaker_id}/{book_id}')
+		duplicates.append(f'{speaker_id}/{book_id}')
+
+print("Duplicates:", duplicates)
\ No newline at end of file
diff --git a/scripts/download_libritts-small.sh b/scripts/download_libritts-small.sh
deleted file mode 100755
index 1f5ca9a..0000000
--- a/scripts/download_libritts-small.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# do not invoke directly in scripts
-if [[ ${PWD##*/} == 'scripts' ]]; then
-	cd ..
-fi
-
-# download training data
-git clone https://huggingface.co/datasets/ecker/libritts-small ./data/libritts-small
\ No newline at end of file
diff --git a/scripts/prepare_libri.py b/scripts/prepare_libri.py
deleted file mode 100755
index 0843308..0000000
--- a/scripts/prepare_libri.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import os
-import json
-
-for f in os.listdir(f'./data/librispeech_finetuning/1h/'):
-	for j in os.listdir(f'./data/librispeech_finetuning/1h/{f}/clean'):
-		for z in os.listdir(f'./data/librispeech_finetuning/1h/{f}/clean/{j}'):
-			for i in os.listdir(f'./data/librispeech_finetuning/1h/{f}/clean/{j}/{z}'):
-				os.rename(f'./data/librispeech_finetuning/1h/{f}/clean/{j}/{z}/{i}', f'./data/librilight-tts/{i}')
-
-for j in os.listdir('./data/librispeech_finetuning/9h/clean'):
-	for z in os.listdir(f'./data/librispeech_finetuning/9h/clean/{j}'):
-		for i in os.listdir(f'./data/librispeech_finetuning/9h/clean/{j}/{z}'):
-			os.rename(f'./data/librispeech_finetuning/9h/clean/{j}/{z}/{i}', f'./data/librilight-tts/{i}')
-
-lst = []
-for i in os.listdir('./data/librilight-tts/'):
-	try:
-		if 'trans' not in i:
-			continue
-		with open(f'./data/librilight-tts/{i}') as f:
-			for row in f:
-				z = row.split('-')
-				name = z[0]+'-'+z[1]+ '-' + z[2].split(' ')[0]
-				text = " ".join(z[2].split(' ')[1:])
-				lst.append([name, text])
-	except Exception as e:
-		pass
-
-for i in lst:
-	try:
-		with open(f'./data/librilight-tts/{i[0]}.txt', 'x') as file:
-			file.write(i[1])
-	except:
-		with open(f'./data/librilight-tts/{i[0]}.txt', 'w+') as file:
-			file.write(i[1])
-
-phoneme_map = {}
-phoneme_transcript = {}
-
-with open('./data/librispeech_finetuning/phones/phones_mapping.json', 'r') as f:
-	phoneme_map_rev = json.load(f)
-	for k, v in phoneme_map_rev.items():
-		phoneme_map[f'{v}'] = k
-
-with open('./data/librispeech_finetuning/phones/10h_phones.txt', 'r') as f:
-	lines = f.readlines()
-	for line in lines:
-		split = line.strip().split(" ")
-		key = split[0]
-		tokens = split[1:]
-
-		phonemes = []
-		for token in tokens:
-			phoneme = phoneme_map[f'{token}']
-			phonemes.append( phoneme )
-
-		phoneme_transcript[key] = " ".join(phonemes)
-
-for filename in sorted(os.listdir('./data/librilight-tts')):
-	split = filename.split('.')
-
-	key = split[0]
-	extension = split[1] # covers double duty of culling .normalized.txt and .phn.txt
-
-	if extension != 'txt':
-		continue
-
-	os.rename(f'./data/librilight-tts/{filename}', f'./data/librilight-tts/{key}.normalized.txt')
-
-	if key in phoneme_transcript:
-		with open(f'./data/librilight-tts/{key}.phn.txt', 'w', encoding='utf-8') as f:
-			f.write(phoneme_transcript[key])
\ No newline at end of file
diff --git a/scripts/prepare_libri.sh b/scripts/prepare_libri.sh
deleted file mode 100755
index d044278..0000000
--- a/scripts/prepare_libri.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-# do not invoke directly in scripts
-if [[ ${PWD##*/} == 'scripts' ]]; then
-	cd ..
-fi
-
-# download training data
-cd data
-mkdir librilight-tts
-if [ ! -e ./librispeech_finetuning.tgz ]; then
-	wget https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz
-fi
-tar -xzf librispeech_finetuning.tgz
-cd ..
-
-# clean it up
-python3 ./scripts/prepare_libri.py
-
-# convert to wav
-pip3 install AudioConverter
-audioconvert convert ./data/librilight-tts/ ./data/librilight-tts --output-format .wav
-
-# process data
-ulimit -Sn `ulimit -Hn` # ROCm is a bitch
-python3 -m vall_e.emb.g2p ./data/librilight-tts # phonemizes anything that might have been amiss in the phoneme transcription
-python3 -m vall_e.emb.qnt ./data/librilight-tts
\ No newline at end of file
diff --git a/scripts/prepare_librilight.py b/scripts/prepare_librilight.py
new file mode 100755
index 0000000..5f3ad8d
--- /dev/null
+++ b/scripts/prepare_librilight.py
@@ -0,0 +1,32 @@
+import os
+import json
+
+input_dataset = "small+medium"
+output_dataset = "LibriLight-6K"
+
+for speaker_id in os.listdir(f'./{input_dataset}/'):
+	if not os.path.isdir(f'./{input_dataset}/{speaker_id}/'):
+		continue
+	for book_name in os.listdir(f'./{input_dataset}/{speaker_id}/'):
+
+		subid = 0
+		for filename in os.listdir(f'./{input_dataset}/{speaker_id}/{book_name}'):
+			if filename[-5:] != ".json":
+				continue
+
+			basename = filename[:-5]
+
+			json_path = f'./{input_dataset}/{speaker_id}/{book_name}/{basename}.json'
+			flac_path = f'./{input_dataset}/{speaker_id}/{book_name}/{basename}.flac'
+
+			j = json.load(open(json_path, 'r', encoding="utf-8"))
+			id = j['book_meta']['id']
+
+			json_id_path = f'./{output_dataset}/{speaker_id}/{speaker_id}_{id}_{subid}.json'
+			flac_id_path = f'./{output_dataset}/{speaker_id}/{speaker_id}_{id}_{subid}.flac'
+
+			os.makedirs(f'./{output_dataset}/{speaker_id}/', exist_ok=True)
+			os.rename(json_path, json_id_path)
+			os.rename(flac_path, flac_id_path)
+
+			subid += 1
diff --git a/scripts/prepare_libritts.py b/scripts/prepare_libritts.py
index c662fe3..d15c24b 100755
--- a/scripts/prepare_libritts.py
+++ b/scripts/prepare_libritts.py
@@ -1,18 +1,21 @@
 import os
 import json
 
-for f in os.listdir(f'./LibriTTS/'):
-	if not os.path.isdir(f'./LibriTTS/{f}/'):
+input_dataset = "LibriTTS_R"
+output_dataset = "LibriTTS-Train"
+
+for dataset_name in os.listdir(f'./{input_dataset}/'):
+	if not os.path.isdir(f'./{input_dataset}/{dataset_name}/'):
 		continue
-	for j in os.listdir(f'./LibriTTS/{f}/'):
-		if not os.path.isdir(f'./LibriTTS/{f}/{j}'):
+	for speaker_id in os.listdir(f'./{input_dataset}/{dataset_name}/'):
+		if not os.path.isdir(f'./{input_dataset}/{dataset_name}/{speaker_id}'):
 			continue
-		for z in os.listdir(f'./LibriTTS/{f}/{j}'):
-			if not os.path.isdir(f'./LibriTTS/{f}/{j}/{z}'):
+		for book_id in os.listdir(f'./{input_dataset}/{dataset_name}/{speaker_id}'):
+			if not os.path.isdir(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}'):
 				continue
-			for i in os.listdir(f'./LibriTTS/{f}/{j}/{z}'):
-				if i[-4:] != ".wav":
+			for filename in os.listdir(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}'):
+				if filename[-4:] != ".wav":
 					continue
-				os.makedirs(f'./LibriTTS-Train/{j}/', exist_ok=True)
-				os.rename(f'./LibriTTS/{f}/{j}/{z}/{i}', f'./LibriTTS-Train/{j}/{i}')
\ No newline at end of file
+				os.makedirs(f'./{output_dataset}/{speaker_id}/', exist_ok=True)
+				os.rename(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}/{filename}', f'./{output_dataset}/{speaker_id}/{filename}')
\ No newline at end of file
diff --git a/scripts/process_libritts.py b/scripts/process_libritts.py
new file mode 100755
index 0000000..8235f47
--- /dev/null
+++ b/scripts/process_libritts.py
@@ -0,0 +1,43 @@
+import os
+import json
+import torch
+
+from tqdm.auto import tqdm
+from pathlib import Path
+from vall_e.emb.g2p import encode as valle_phonemize
+from vall_e.emb.qnt import encode_from_file as valle_quantize, _replace_file_extension
+
+input_dataset = "LibriTTS_R"
+output_dataset = "LibriTTS-Train"
+device = "cuda"
+
+txts = []
+wavs = []
+
+for dataset_name in os.listdir(f'./{input_dataset}/'):
+	if not os.path.isdir(f'./{input_dataset}/{dataset_name}/'):
+		continue
+
+	for speaker_id in tqdm(os.listdir(f'./{input_dataset}/{dataset_name}/'), desc="Processing speaker"):
+		if not os.path.isdir(f'./{input_dataset}/{dataset_name}/{speaker_id}'):
+			continue
+
+		os.makedirs(f'./{output_dataset}/{speaker_id}/', exist_ok=True)
+		for book_id in os.listdir(f'./{input_dataset}/{dataset_name}/{speaker_id}'):
+			if not os.path.isdir(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}'):
+				continue
+			for filename in os.listdir(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}'):
+				os.rename(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}/{filename}', f'./{output_dataset}/{speaker_id}/{filename}')
+
+				if ".original.txt" in filename:
+					txts.append(Path(f'./{output_dataset}/{speaker_id}/{filename}'))
+				if ".wav" in filename:
+					wavs.append(Path(f'./{output_dataset}/{speaker_id}/{filename}'))
+
+for path in tqdm(txts, desc="Phonemizing..."):
+	phones = valle_phonemize(open(path, "r", encoding="utf-8").read())
+	open(_replace_file_extension(path, ".phn.txt"), "w", encoding="utf-8").write(" ".join(phones))
+
+for path in tqdm(wavs, desc="Quantizing..."):
+	qnt = valle_quantize(path, device=device)
+	torch.save(qnt.cpu(), _replace_file_extension(path, ".qnt.pt"))