added helper scripts to process LibriTTS/LibriLight, detect duplicate speaker+book pairs between them, and directly phonemize and quantize LibriTTS
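For orientation, the scripts in this commit presumably run in roughly this order; the working directory and the ordering itself are assumptions, not stated in the commit:

# 1. scripts/prepare_librilight.py               flatten LibriLight into LibriLight-6K/<speaker>/
# 2. scripts/process_libritts.py                 flatten LibriTTS_R, then phonemize + quantize
# 3. scripts/deduplicate_librilight_libritts.py  report speaker/book overlap between the two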
This commit is contained in:
parent
16e0020901
commit
7b3be3d7bf
scripts/deduplicate_librilight_libritts.py | 45 (new executable file)

@@ -0,0 +1,45 @@
+import os
+import json
+
+librilight_dir = "LibriLight-6K"
+libritts_dir = "LibriTTS-Train"
+
+librilight_data = {}
+libritts_data = {}
+
+for speaker_id in os.listdir(f'./{librilight_dir}/'):
+    for filename in os.listdir(f'./{librilight_dir}/{speaker_id}'):
+        parts = filename.split("_")
+        book_id = parts[1]
+        subid = parts[2]
+
+        if speaker_id not in librilight_data:
+            librilight_data[speaker_id] = {}
+        if book_id not in librilight_data[speaker_id]:
+            librilight_data[speaker_id][book_id] = []
+        librilight_data[speaker_id][book_id].append(subid)
+
+for speaker_id in os.listdir(f'./{libritts_dir}/'):
+    for filename in os.listdir(f'./{libritts_dir}/{speaker_id}'):
+        parts = filename.split("_")
+        book_id = parts[1]
+        subid = parts[2]
+
+        if speaker_id not in libritts_data:
+            libritts_data[speaker_id] = {}
+        if book_id not in libritts_data[speaker_id]:
+            libritts_data[speaker_id][book_id] = []
+        libritts_data[speaker_id][book_id].append(subid)
+
+duplicates = []
+
+for speaker_id, books in libritts_data.items():
+    if speaker_id not in librilight_data:
+        continue
+    for book_id, _ in books.items():
+        if book_id not in librilight_data[speaker_id]:
+            continue
+        print(f'Duplicate: {speaker_id}/{book_id}')
+        duplicates.append(f'{speaker_id}/{book_id}')
+
+print("Duplicates:", duplicates)
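For reference, a minimal sketch of the filename split this script relies on, matching the `{speaker_id}_{book_id}_{subid}` scheme that prepare_librilight.py (below) produces; the concrete IDs here are made up:

parts = "14_208_3.flac".split("_")
speaker_id = parts[0]   # "14"
book_id    = parts[1]   # "208"
subid      = parts[2]   # "3.flac" -- the extension is never stripped, but only
                        # speaker_id/book_id are compared, so this is harmless
assert (speaker_id, book_id) == ("14", "208")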
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# do not invoke directly in scripts
-if [[ ${PWD##*/} == 'scripts' ]]; then
-    cd ..
-fi
-
-# download training data
-git clone https://huggingface.co/datasets/ecker/libritts-small ./data/libritts-small
@@ -1,72 +0,0 @@
-import os
-import json
-
-for f in os.listdir(f'./data/librispeech_finetuning/1h/'):
-    for j in os.listdir(f'./data/librispeech_finetuning/1h/{f}/clean'):
-        for z in os.listdir(f'./data/librispeech_finetuning/1h/{f}/clean/{j}'):
-            for i in os.listdir(f'./data/librispeech_finetuning/1h/{f}/clean/{j}/{z}'):
-                os.rename(f'./data/librispeech_finetuning/1h/{f}/clean/{j}/{z}/{i}', f'./data/librilight-tts/{i}')
-
-for j in os.listdir('./data/librispeech_finetuning/9h/clean'):
-    for z in os.listdir(f'./data/librispeech_finetuning/9h/clean/{j}'):
-        for i in os.listdir(f'./data/librispeech_finetuning/9h/clean/{j}/{z}'):
-            os.rename(f'./data/librispeech_finetuning/9h/clean/{j}/{z}/{i}', f'./data/librilight-tts/{i}')
-
-lst = []
-for i in os.listdir('./data/librilight-tts/'):
-    try:
-        if 'trans' not in i:
-            continue
-        with open(f'./data/librilight-tts/{i}') as f:
-            for row in f:
-                z = row.split('-')
-                name = z[0]+'-'+z[1]+ '-' + z[2].split(' ')[0]
-                text = " ".join(z[2].split(' ')[1:])
-                lst.append([name, text])
-    except Exception as e:
-        pass
-
-for i in lst:
-    try:
-        with open(f'./data/librilight-tts/{i[0]}.txt', 'x') as file:
-            file.write(i[1])
-    except:
-        with open(f'./data/librilight-tts/{i[0]}.txt', 'w+') as file:
-            file.write(i[1])
-
-phoneme_map = {}
-phoneme_transcript = {}
-
-with open('./data/librispeech_finetuning/phones/phones_mapping.json', 'r') as f:
-    phoneme_map_rev = json.load(f)
-    for k, v in phoneme_map_rev.items():
-        phoneme_map[f'{v}'] = k
-
-with open('./data/librispeech_finetuning/phones/10h_phones.txt', 'r') as f:
-    lines = f.readlines()
-    for line in lines:
-        split = line.strip().split(" ")
-        key = split[0]
-        tokens = split[1:]
-
-        phonemes = []
-        for token in tokens:
-            phoneme = phoneme_map[f'{token}']
-            phonemes.append( phoneme )
-
-        phoneme_transcript[key] = " ".join(phonemes)
-
-for filename in sorted(os.listdir('./data/librilight-tts')):
-    split = filename.split('.')
-
-    key = split[0]
-    extension = split[1] # covers double duty of culling .normalized.txt and .phn.txt
-
-    if extension != 'txt':
-        continue
-
-    os.rename(f'./data/librilight-tts/{filename}', f'./data/librilight-tts/{key}.normalized.txt')
-
-    if key in phoneme_transcript:
-        with open(f'./data/librilight-tts/{key}.phn.txt', 'w', encoding='utf-8') as f:
-            f.write(phoneme_transcript[key])
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-# do not invoke directly in scripts
-if [[ ${PWD##*/} == 'scripts' ]]; then
-    cd ..
-fi
-
-# download training data
-cd data
-mkdir librilight-tts
-if [ ! -e ./librispeech_finetuning.tgz ]; then
-    wget https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz
-fi
-tar -xzf librispeech_finetuning.tgz
-cd ..
-
-# clean it up
-python3 ./scripts/prepare_libri.py
-
-# convert to wav
-pip3 install AudioConverter
-audioconvert convert ./data/librilight-tts/ ./data/librilight-tts --output-format .wav
-
-# process data
-ulimit -Sn `ulimit -Hn` # ROCm needs a high open-file limit
-python3 -m vall_e.emb.g2p ./data/librilight-tts # phonemizes anything that might have been amiss in the phoneme transcription
-python3 -m vall_e.emb.qnt ./data/librilight-tts
scripts/prepare_librilight.py | 32 (new executable file)

@@ -0,0 +1,32 @@
+import os
+import json
+
+input_dataset = "small+medium"
+output_dataset = "LibriLight-6K"
+
+for speaker_id in os.listdir(f'./{input_dataset}/'):
+    if not os.path.isdir(f'./{input_dataset}/{speaker_id}/'):
+        continue
+    for book_name in os.listdir(f'./{input_dataset}/{speaker_id}/'):
+
+        subid = 0
+        for filename in os.listdir(f'./{input_dataset}/{speaker_id}/{book_name}'):
+            if filename[-5:] != ".json":
+                continue
+
+            basename = filename[:-5]
+
+            json_path = f'./{input_dataset}/{speaker_id}/{book_name}/{basename}.json'
+            flac_path = f'./{input_dataset}/{speaker_id}/{book_name}/{basename}.flac'
+
+            j = json.load(open(json_path, 'r', encoding="utf-8"))
+            id = j['book_meta']['id']
+
+            json_id_path = f'./{output_dataset}/{speaker_id}/{speaker_id}_{id}_{subid}.json'
+            flac_id_path = f'./{output_dataset}/{speaker_id}/{speaker_id}_{id}_{subid}.flac'
+
+            os.makedirs(f'./{output_dataset}/{speaker_id}/', exist_ok=True)
+            os.rename(json_path, json_id_path)
+            os.rename(flac_path, flac_id_path)
+
+            subid += 1
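A minimal sketch of the renaming step above, assuming the per-clip JSON carries a `book_meta.id` field as the script expects; all concrete values here are hypothetical:

import json

# shape assumed from LibriLight's per-clip metadata; values are made up
j = json.loads('{"book_meta": {"id": "208"}}')

speaker_id, subid = "14", 0
new_name = f"{speaker_id}_{j['book_meta']['id']}_{subid}.flac"
assert new_name == "14_208_0.flac"  # the speaker_book_subid scheme the dedup script parses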
@@ -1,18 +1,21 @@
 import os
 import json
 
-for f in os.listdir(f'./LibriTTS/'):
-    if not os.path.isdir(f'./LibriTTS/{f}/'):
+input_dataset = "LibriTTS_R"
+output_dataset = "LibriTTS-Train"
+
+for dataset_name in os.listdir(f'./{input_dataset}/'):
+    if not os.path.isdir(f'./{input_dataset}/{dataset_name}/'):
         continue
-    for j in os.listdir(f'./LibriTTS/{f}/'):
-        if not os.path.isdir(f'./LibriTTS/{f}/{j}'):
+    for speaker_id in os.listdir(f'./{input_dataset}/{dataset_name}/'):
+        if not os.path.isdir(f'./{input_dataset}/{dataset_name}/{speaker_id}'):
             continue
-        for z in os.listdir(f'./LibriTTS/{f}/{j}'):
-            if not os.path.isdir(f'./LibriTTS/{f}/{j}/{z}'):
+        for book_id in os.listdir(f'./{input_dataset}/{dataset_name}/{speaker_id}'):
+            if not os.path.isdir(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}'):
                 continue
-            for i in os.listdir(f'./LibriTTS/{f}/{j}/{z}'):
-                if i[-4:] != ".wav":
+            for filename in os.listdir(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}'):
+                if filename[-4:] != ".wav":
                     continue
 
-                os.makedirs(f'./LibriTTS-Train/{j}/', exist_ok=True)
-                os.rename(f'./LibriTTS/{f}/{j}/{z}/{i}', f'./LibriTTS-Train/{j}/{i}')
+                os.makedirs(f'./{output_dataset}/{speaker_id}/', exist_ok=True)
+                os.rename(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}/{filename}', f'./{output_dataset}/{speaker_id}/{filename}')
scripts/process_libritts.py | 43 (new executable file)

@@ -0,0 +1,43 @@
+import os
+import json
+import torch
+
+from tqdm.auto import tqdm
+from pathlib import Path
+from vall_e.emb.g2p import encode as valle_phonemize
+from vall_e.emb.qnt import encode_from_file as valle_quantize, _replace_file_extension
+
+input_dataset = "LibriTTS_R"
+output_dataset = "LibriTTS-Train"
+device = "cuda"
+
+txts = []
+wavs = []
+
+for dataset_name in os.listdir(f'./{input_dataset}/'):
+    if not os.path.isdir(f'./{input_dataset}/{dataset_name}/'):
+        continue
+
+    for speaker_id in tqdm(os.listdir(f'./{input_dataset}/{dataset_name}/'), desc="Processing speaker"):
+        if not os.path.isdir(f'./{input_dataset}/{dataset_name}/{speaker_id}'):
+            continue
+
+        os.makedirs(f'./{output_dataset}/{speaker_id}/', exist_ok=True)
+        for book_id in os.listdir(f'./{input_dataset}/{dataset_name}/{speaker_id}'):
+            if not os.path.isdir(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}'):
+                continue
+            for filename in os.listdir(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}'):
+                os.rename(f'./{input_dataset}/{dataset_name}/{speaker_id}/{book_id}/{filename}', f'./{output_dataset}/{speaker_id}/{filename}')
+
+                if ".original.txt" in filename:
+                    txts.append(Path(f'./{output_dataset}/{speaker_id}/{filename}'))
+                if ".wav" in filename:
+                    wavs.append(Path(f'./{output_dataset}/{speaker_id}/{filename}'))
+
+for path in tqdm(txts, desc="Phonemizing..."):
+    phones = valle_phonemize(open(path, "r", encoding="utf-8").read())
+    open(_replace_file_extension(path, ".phn.txt"), "w", encoding="utf-8").write(" ".join(phones))
+
+for path in tqdm(wavs, desc="Quantizing..."):
+    qnt = valle_quantize(path, device=device)
+    torch.save(qnt.cpu(), _replace_file_extension(path, ".qnt.pt"))
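For orientation, the on-disk layout this script should leave behind for a single utterance; <utt> stands in for whatever basename LibriTTS_R uses, and the exact contents of the .qnt.pt tensor are an assumption:

# LibriTTS-Train/<speaker_id>/<utt>.original.txt  transcript moved out of the book tree
# LibriTTS-Train/<speaker_id>/<utt>.phn.txt       space-joined phonemes from vall_e.emb.g2p
# LibriTTS-Train/<speaker_id>/<utt>.wav           audio file moved alongside it
# LibriTTS-Train/<speaker_id>/<utt>.qnt.pt        quantized-code tensor saved with torch.save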