final tweaks, hopefully, again

mrq 2024-05-15 23:04:19 -05:00
parent 8d79f78e0a
commit d9aabfa3ae
4 changed files with 450 additions and 570 deletions

View File

@@ -41,9 +41,7 @@
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"pre_tokenizer": null,
"post_processor": {
"type": "TemplateProcessing",
"single": [
@@ -110,263 +108,264 @@
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<unk>": 0,
"<bos>": 1,
"</eos>": 2,
"<mask>": 3,
"!": 4,
"\"": 5,
"(": 6,
")": 7,
",": 8,
"-": 9,
".": 10,
"1": 11,
":": 12,
"?": 13,
"a": 14,
"b": 15,
"c": 16,
"d": 17,
"e": 18,
"f": 19,
"h": 20,
"i": 21,
"j": 22,
"k": 23,
"l": 24,
"m": 25,
"n": 26,
"o": 27,
"p": 28,
"q": 29,
"r": 30,
"s": 31,
"t": 32,
"u": 33,
"v": 34,
"w": 35,
"x": 36,
"z": 37,
"¡": 38,
"«": 39,
"»": 40,
"¿": 41,
"æ": 42,
"ç": 43,
"ð": 44,
"ŋ": 45,
"ɐ": 46,
"ɑ": 47,
"ɔ": 48,
"ɕ": 49,
"ə": 50,
"ɚ": 51,
"ɛ": 52,
"ɜ": 53,
"ɟ": 54,
"ɡ": 55,
"ɪ": 56,
"ɬ": 57,
"ɯ": 58,
"ɹ": 59,
"ɾ": 60,
"ʃ": 61,
"ʈ": 62,
"ʊ": 63,
"ʋ": 64,
"ʌ": 65,
"ʑ": 66,
"ʒ": 67,
"ʔ": 68,
"ʲ": 69,
"ˈ": 70,
"ˌ": 71,
"ː": 72,
"̃": 73,
"̩": 74,
"θ": 75,
"": 76,
"": 77,
"": 78,
"ˈɛ": 79,
"iː": 80,
"aɪ": 81,
"nd": 82,
"ˈɪ": 83,
"eɪ": 84,
"ˈæ": 85,
"": 86,
"ðə": 87,
"ɑː": 88,
"ˈeɪ": 89,
"ən": 90,
"uː": 91,
"ˈʌ": 92,
"ˈaɪ": 93,
"st": 94,
"ˈɔ": 95,
"ˈ": 96,
"ˈiː": 97,
"ˈɑː": 98,
"ænd": 99,
"ːɹ": 100,
"ɪŋ": 101,
"ɜː": 102,
"ɪn": 103,
"": 104,
"ʌv": 105,
"": 106,
"əl": 107,
"ˈuː": 108,
"": 109,
"ɪz": 110,
"ˈɜː": 111,
"ˌʌ": 112,
"æt": 113,
"": 114,
"ˈɔː": 115,
"ɪt": 116,
"ˈ": 117,
"ɚɹ": 118,
"ˈɛn": 119,
"": 120,
"li": 121,
"hiː": 122,
"ˌɛ": 123,
"wɪ": 124,
"ðæt": 125,
"wʌz": 126,
"juː": 127,
"oːɹ": 128,
"ðɪ": 129,
"sˈɛ": 130,
"ˈɑːɹ": 131,
"ˌɪ": 132,
"nt": 133,
"ˈʊ": 134,
"ənt": 135,
"hɪz": 136,
"": 137,
"ˌɑː": 138,
"ɔːɹ": 139,
"ˈɛɹ": 140,
"wɪð": 141,
"ᵻd": 142,
"ˈoːɹ": 143,
"pɹ": 144,
"ˈɔːl": 145,
"": 146,
"ʃən": 147,
"kt": 148,
"ˌoʊ": 149,
"ˈɔːɹ": 150,
"": 151,
"æz": 152,
"ʃiː": 153,
"ˌʌt": 154,
"ˈɛl": 155,
"ˌaʊ": 156,
"ˈʌn": 157,
"əs": 158,
"ː": 159,
"lˈaɪ": 160,
"ˈæn": 161,
"ˈɪɹ": 162,
"ʊd": 163,
"ɹᵻ": 164,
"ld": 165,
"bˌʌt": 166,
"ks": 167,
"nˈ": 168,
"ɾɚ": 169,
"hæd": 170,
"ɛɹ": 171,
"ˈɪŋ": 172,
"ɡɹ": 173,
"ɔn": 174,
"ɑː": 175,
"maɪ": 176,
"": 177,
"ːɹ": 178,
"ðɚ": 179,
"": 180,
"ðɛɹ": 181,
"ˈʌm": 182,
"ɑːt": 183,
"tɹ": 184,
"sˈiː": 185,
"ʌvðə": 186,
"mˈɪ": 187,
"ˈæp": 188,
"ˌɪm": 189,
"ɪk": 190,
"sp": 191,
"lˈeɪ": 192,
"hˌɪm": 193,
"ɐn": 194,
"ðeɪ": 195,
"lˈɪ": 196,
"ɾi": 197,
"": 198,
"lˈɛ": 199,
"": 200,
"ˈɪl": 201,
"jˈuː": 202,
"ʌm": 203,
"mˌiː": 204,
" ": 4,
"!": 5,
"\"": 6,
"(": 7,
")": 8,
",": 9,
"-": 10,
".": 11,
"1": 12,
":": 13,
";": 14,
"?": 15,
"a": 16,
"b": 17,
"c": 18,
"d": 19,
"e": 20,
"f": 21,
"h": 22,
"i": 23,
"j": 24,
"k": 25,
"l": 26,
"m": 27,
"n": 28,
"o": 29,
"p": 30,
"q": 31,
"r": 32,
"s": 33,
"t": 34,
"u": 35,
"v": 36,
"w": 37,
"x": 38,
"z": 39,
"¡": 40,
"«": 41,
"»": 42,
"¿": 43,
"æ": 44,
"ç": 45,
"ð": 46,
"ŋ": 47,
"ɐ": 48,
"ɑ": 49,
"ɔ": 50,
"ɕ": 51,
"ə": 52,
"ɚ": 53,
"ɛ": 54,
"ɜ": 55,
"ɟ": 56,
"ɡ": 57,
"ɪ": 58,
"ɬ": 59,
"ɯ": 60,
"ɹ": 61,
"ɾ": 62,
"ʃ": 63,
"ʈ": 64,
"ʊ": 65,
"ʋ": 66,
"ʌ": 67,
"ʑ": 68,
"ʒ": 69,
"ʔ": 70,
"ʲ": 71,
"ˈ": 72,
"ˌ": 73,
"ː": 74,
"̃": 75,
"̩": 76,
"θ": 77,
"": 78,
"": 79,
"": 80,
"": 81,
"": 82,
"ˈɛ": 83,
"iː": 84,
"aɪ": 85,
"nd": 86,
"ˈɪ": 87,
"eɪ": 88,
"ˈæ": 89,
"ðə": 90,
"": 91,
"ɑː": 92,
"ˈeɪ": 93,
"ən": 94,
"uː": 95,
"ˈʌ": 96,
"ˈaɪ": 97,
"st": 98,
"ˈɔ": 99,
"ˈ": 100,
"ˈiː": 101,
"ˈɑː": 102,
"ænd": 103,
"ːɹ": 104,
"ɪŋ": 105,
"ɜː": 106,
"ɪn": 107,
"": 108,
"ʌv": 109,
"": 110,
"əl": 111,
"ˈuː": 112,
"tʃ": 113,
"ɪz": 114,
"ˈɜː": 115,
"ˌʌ": 116,
"æt": 117,
"": 118,
"ˈɔː": 119,
"ɪt": 120,
"ˈ": 121,
"ɚɹ": 122,
"ˈɛn": 123,
"wʌ": 124,
"li": 125,
"hiː": 126,
"ˌɛ": 127,
"wɪ": 128,
"wʌz": 129,
"ðæt": 130,
"juː": 131,
"oːɹ": 132,
"ðɪ": 133,
"sˈɛ": 134,
"ˌɪ": 135,
"ˈɑːɹ": 136,
"nt": 137,
"ˈʊ": 138,
"ənt": 139,
"hɪz": 140,
"ˌɑː": 141,
"": 142,
"ɔːɹ": 143,
"ˈɛɹ": 144,
"wɪð": 145,
"ᵻd": 146,
"ˈoːɹ": 147,
"": 148,
"ˈɔːl": 149,
"": 150,
"ʃən": 151,
"kt": 152,
"ˌoʊ": 153,
"ˈɔːɹ": 154,
"": 155,
"æz": 156,
"ˌʌt": 157,
"ʃiː": 158,
"ˈɛl": 159,
"ˌaʊ": 160,
"ˈʌn": 161,
"əs": 162,
"ː": 163,
"lˈaɪ": 164,
"ˈæn": 165,
"ˈɪɹ": 166,
"ʊd": 167,
"ɹᵻ": 168,
"ld": 169,
"bˌʌt": 170,
"ks": 171,
"nˈ": 172,
"hæd": 173,
"ɾɚ": 174,
"ɛɹ": 175,
"ˈɪŋ": 176,
"ɡɹ": 177,
"ɑː": 178,
"ɔn": 179,
"": 180,
"maɪ": 181,
"ːɹ": 182,
"ðɚ": 183,
"tʊ": 184,
"ðɛɹ": 185,
"ɑːt": 186,
"ˈʌm": 187,
"": 188,
"sˈiː": 189,
"ʌvðə": 190,
"mˈɪ": 191,
"hˈæ": 192,
"ˌɪm": 193,
"lˈeɪ": 194,
"ɪk": 195,
"sp": 196,
"ɪm": 197,
"ɐn": 198,
"ðeɪ": 199,
"lˈɪ": 200,
"ɾi": 201,
"lˈɛ": 202,
"": 203,
"": 204,
"lˈæ": 205,
"ˌɪn": 206,
"bᵻ": 207,
"wˈʌn": 208,
"ˈɪn": 209,
"ˈoʊn": 210,
"biː": 211,
"sˈɛd": 212,
"ˈɛd": 213,
"ˈaɪt": 214,
"fɹʌm": 215,
"baɪ": 216,
"ɪs": 217,
"ɚz": 218,
"ðɪs": 219,
"əns": 220,
"ɪf": 221,
"bəl": 222,
"ˈænd": 223,
"ɪnðə": 224,
"əm": 225,
"iːz": 226,
"ˌuː": 227,
"ᵻz": 228,
"wˈeɪ": 229,
"ft": 230,
"wiː": 231,
"lˈiː": 232,
"stɹ": 233,
"": 234,
"ɚd": 235,
"ˌaɪ": 236,
"kw": 237,
"ˌɔn": 238,
"ˈaɪd": 239,
"ts": 240,
"ɪm": 241,
"ˈʌst": 242,
"ˈoʊld": 243,
"ˌɪ": 244,
"dˈɪ": 245,
"sˌoʊ": 246,
"ɑːɹ": 247,
"": 248,
"sˈeɪ": 249,
"ɾᵻd": 250,
"dᵻ": 251,
"ɪ": 252,
"sˈɛl": 253,
"ɹi": 254,
"ˈʌðɚ": 255
"ˈɪl": 206,
"jˈuː": 207,
"ʌm": 208,
"mˌiː": 209,
"bᵻ": 210,
"wˈʌn": 211,
"ˌɪn": 212,
"ˈɪn": 213,
"ˈoʊn": 214,
"sˈɛd": 215,
"biː": 216,
"ˈɛd": 217,
"ˈaɪt": 218,
"baɪ": 219,
"fɹʌm": 220,
"ɪs": 221,
"ɚz": 222,
"ðɪs": 223,
"əns": 224,
"bəl": 225,
"ɪf": 226,
"ɪnðə": 227,
"əm": 228,
"ᵻz": 229,
"ˌuː": 230,
"wˈeɪ": 231,
"ft": 232,
"wiː": 233,
"stɹ": 234,
"lˈiː": 235,
"iːz": 236,
"pt": 237,
"": 238,
"ɚd": 239,
"ˌaɪ": 240,
"kw": 241,
"ˌɔn": 242,
"ˈaɪd": 243,
"ɪm": 244,
"ˈʌst": 245,
"ˈoʊld": 246,
"ts": 247,
"ˌɪ": 248,
"sˌoʊ": 249,
"dˈɪ": 250,
"ɑːɹ": 251,
"": 252,
"sˈeɪ": 253,
"ɾᵻd": 254,
"ɪ": 255
},
"merges": [
"ˈ ɛ",
@@ -376,8 +375,8 @@
"ˈ ɪ",
"e ɪ",
"ˈ æ",
"o ʊ",
"ð ə",
"o ʊ",
"ɑ ː",
"ˈ eɪ",
"ə n",
@@ -415,20 +414,20 @@
"h iː",
"ˌ ɛ",
"w ɪ",
"ð æt",
"wʌ z",
"ð æt",
"j uː",
"o ːɹ",
ɪ",
"s ˈɛ",
"ˈɑː ɹ",
ɪ",
"ˈɑː ɹ",
"n t",
"ˈ ʊ",
"ən t",
"h ɪz",
"h æ",
ɑː",
"h æ",
"ɔ ːɹ",
"ˈɛ ɹ",
"wɪ ð",
@@ -443,8 +442,8 @@
"ˈɔ ːɹ",
"f ɹ",
"æ z",
"ʃ iː",
"ˌʌ t",
"ʃ iː",
"ˈɛ l",
"ˌ aʊ",
"ˈʌ n",
@@ -459,93 +458,89 @@
"b ˌʌt",
"k s",
"n ˈoʊ",
"ɾ ɚ",
"hæ d",
"ɾ ɚ",
"ɛ ɹ",
"ˈɪ ŋ",
"ɡ ɹ",
"ɔ n",
"n ˌɑː",
"m aɪ",
"ɔ n",
"v ɚ",
"m aɪ",
"f ɔːɹ",
"ð ɚ",
"t ʊ",
"ð ɛɹ",
"ˈʌ m",
"nˌɑː t",
"ˈʌ m",
"t ɹ",
"s ˈiː",
"ʌv ðə",
"m ˈɪ",
"ˈæ p",
"h ˈæ",
"ˌɪ m",
"l ˈeɪ",
"ɪ k",
"s p",
"l ˈeɪ",
"h ˌɪm",
"ɐ n",
"ð eɪ",
"l ˈɪ",
"ɾ i",
"b ɹ",
"l ˈɛ",
"b ɹ",
"k ɹ",
"l ˈæ",
"ˈɪ l",
"j ˈuː",
"ʌ m",
"mˌ iː",
"l ˈæ",
ɪn",
"b ᵻ",
"w ˈʌn",
ɪn",
"ˈɪ n",
"ˈoʊ n",
"b iː",
"sˈɛ d",
"b iː",
"ˈɛ d",
"ˈaɪ t",
"fɹ ʌm",
"b aɪ",
"fɹ ʌm",
"ɪ s",
"ɚ z",
"ðɪ s",
"ən s",
"ɪ f",
"b əl",
"ˈæ nd",
"ɪ f",
"ɪn ðə",
"ə m",
"iː z",
"ˌ uː",
"ᵻ z",
"ˌ uː",
"w ˈeɪ",
"f t",
"w iː",
"l ˈiː",
"st ɹ",
"l ˈiː",
"iː z",
"p t",
"j ʊ",
"ɚ d",
"ˌ aɪ",
"k w",
"ˌ ɔn",
"ˈaɪ d",
"t s",
"ɪ m",
"ˈʌ st",
"ˈoʊ ld",
"t s",
"ˌɪ tʃ",
"d ˈɪ",
"s ˌoʊ",
"d ˈɪ",
"ɑː ɹ",
"h ɐ",
"s ˈeɪ",
"ɾ ᵻd",
"d ᵻ",
"w ˌɪtʃ",
"sˈɛ l",
"ɹ i",
"ˈʌ ðɚ"
"w ˌɪtʃ"
]
}
}
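The net change to the tokenizer file: the Whitespace pre-tokenizer is dropped in favor of pre_tokenizer: null, and " " and ";" enter the vocab at ids 4 and 14, shifting every later entry up by two. A minimal sanity-check sketch, assuming the tokenizers package and a save path of ./training/tokenizer.json (both assumptions, not part of this diff):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("./training/tokenizer.json")  # hypothetical path
encoded = tokenizer.encode("ðə kˈæt")
print(encoded.tokens)  # with pre_tokenizer null, " " survives as its own token (id 4)
print(encoded.ids)     # framed by <bos>/<eos> ids 1/2 via the TemplateProcessing post-processor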

View File

@@ -6,16 +6,22 @@ import torchaudio
from tqdm.auto import tqdm
from pathlib import Path
from vall_e.config import cfg
# things that could be args
cfg.sample_rate = 44_000
cfg.inference.audio_backend = "dac"
"""
cfg.inference.weight_dtype = "bfloat16"
cfg.inference.dtype = torch.bfloat16
cfg.inference.amp = True
"""
from vall_e.emb.g2p import encode as valle_phonemize
from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension
# things that could be args
cfg.sample_rate = 24_000
cfg.inference.audio_backend = "encodec"
input_audio = "voices"
input_metadata = "./training/metadata"
output_dataset = f"./training/data-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}"
input_metadata = "metadata"
output_dataset = f"training-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}"
device = "cuda"
audio_extension = ".dac" if cfg.inference.audio_backend == "dac" else ".enc"
@@ -34,9 +40,6 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
print("Is not dir:", f'./{input_audio}/{dataset_name}/')
continue
if dataset_name in ["LibriVox", "Audiobooks"]:
continue
for speaker_id in tqdm(sorted(os.listdir(f'./{input_audio}/{dataset_name}/')), desc=f"Processing speaker in {dataset_name}"):
if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
@@ -55,10 +58,29 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
waveform, sample_rate = torchaudio.load(inpath)
qnt = valle_quantize(waveform, sr=sample_rate, device=device)
if cfg.inference.audio_backend == "dac":
qnt.save(_replace_file_extension(outpath, audio_extension))
np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.codes.numpy().astype(np.uint16),
"metadata": {
"original_length": qnt.original_length,
"sample_rate": qnt.sample_rate,
"input_db": qnt.input_db.numpy().astype(np.float32),
"chunk_length": qnt.chunk_length,
"channels": qnt.channels,
"padding": qnt.padding,
"dac_version": "1.0.0",
},
})
else:
torch.save( qnt, _replace_file_extension(outpath, audio_extension) )
np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.numpy().astype(np.uint16),
"metadata": {
"original_length": waveform.shape[-1],
"sample_rate": sample_rate,
},
})
continue
@@ -91,7 +113,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
fname = filename.replace(f'.{extension}', "")
waveform, sample_rate = None, None
language = metadata[filename]["language"] if "language" in metadata[filename] else "english"
language = metadata[filename]["language"] if "language" in metadata[filename] else "en"
if len(metadata[filename]["segments"]) == 0 or not use_slices:
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}.{extension}')
@@ -100,86 +122,101 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
if len(text) == 0:
continue
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists():
if _replace_file_extension(outpath, audio_extension).exists():
continue
if not _replace_file_extension(outpath, ".json").exists():
txts.append((
outpath,
text,
language,
))
if not _replace_file_extension(outpath, audio_extension).exists():
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)
wavs.append((
outpath,
waveform,
sample_rate
))
wavs.append((
outpath,
text,
language,
waveform,
sample_rate
))
else:
i = 0
for segment in metadata[filename]["segments"]:
id = pad(i, 4)
i = i + 1
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists():
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
text = metadata[filename]["text"]
if len(text) == 0:
continue
if not _replace_file_extension(outpath, ".json").exists():
txts.append((
outpath,
segment["text"],
language,
))
if not _replace_file_extension(outpath, audio_extension).exists():
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
if _replace_file_extension(outpath, audio_extension).exists():
continue
start = int(segment['start'] * sample_rate)
end = int(segment['end'] * sample_rate)
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)
if start < 0:
start = 0
if end >= waveform.shape[-1]:
end = waveform.shape[-1] - 1
start = int(segment['start'] * sample_rate)
end = int(segment['end'] * sample_rate)
if end - start < 0:
continue
if start < 0:
start = 0
if end >= waveform.shape[-1]:
end = waveform.shape[-1] - 1
wavs.append((
outpath,
waveform[:, start:end],
sample_rate
))
if end - start < 0:
continue
if len(txts) > 0:
for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}", disable=True):
outpath, text, language = job
phones = valle_phonemize(text)
data = {
"text": text.strip(),
"phonemes": phones,
"language": language,
}
open(_replace_file_extension(outpath, ".json"), 'w', encoding='utf-8').write(json.dumps(data))
wavs.append((
outpath,
text,
language,
waveform[:, start:end],
sample_rate
))
if len(wavs) > 0:
for job in tqdm(wavs, desc=f"Quantizing: {speaker_id}"):
try:
outpath, waveform, sample_rate = job
outpath, text, language, waveform, sample_rate = job
phones = valle_phonemize(text)
qnt = valle_quantize(waveform, sr=sample_rate, device=device)
if cfg.inference.audio_backend == "dac":
qnt.save(_replace_file_extension(outpath, audio_extension))
np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.codes.numpy().astype(np.uint16),
"metadata": {
"original_length": qnt.original_length,
"sample_rate": qnt.sample_rate,
"input_db": qnt.input_db.numpy().astype(np.float32),
"chunk_length": qnt.chunk_length,
"channels": qnt.channels,
"padding": qnt.padding,
"dac_version": "1.0.0",
"text": text.strip(),
"phonemes": "".join(phones),
"language": language,
},
})
else:
torch.save( qnt, _replace_file_extension(outpath, audio_extension) )
np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.numpy().astype(np.uint16),
"metadata": {
"original_length": waveform.shape[-1],
"sample_rate": sample_rate,
"text": text.strip(),
"phonemes": "".join(phones),
"language": language,
},
})
except Exception as e:
print(f"Failed to quantize: {outpath}:", e)
continue
open("./missing.json", 'w', encoding='utf-8').write(json.dumps(missing))
open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))
open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))

View File

@@ -38,20 +38,23 @@ else:
metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/{id}')
metadata = json.loads(open(metadata_path, "r", encoding="utf-8").read())
if "phonemes" not in metadata:
continue
tokenizer_data.append( f'{"".join(metadata["phonemes"])}' )
open(output_file, 'w', encoding='utf-8').write(json.dumps(tokenizer_data))
unk_token = "<unk>"
spl_tokens = ["<bos>", "</eos>", unk_token, "<mask>"]
spl_tokens = [unk_token, "<bos>", "</eos>", "<mask>", "<space>"]
trainer = BpeTrainer(special_tokens = spl_tokens, vocab_size = 256)
tokenizer = Tokenizer(BPE(unk_token = unk_token))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.pre_tokenizer = Whitespace() # takes 2 hours to process without this; we'll just manually add spaces as a token
tokenizer.post_processor = TemplateProcessing(
single="<bos> $A <eos>",
special_tokens=[("<bos>", 1), ("<eos>", 2)],
)
tokenizer.train_from_iterator(tokenizer_data, trainer=trainer)
tokenizer.save("./training/tokenizer.json")
tokenizer.save("./training/tokenizer_training_data.json")

View File

@@ -86,19 +86,15 @@ def _calculate_durations( type="training" ):
def _load_paths(dataset, type="training"):
return { cfg.get_spkr( cfg.data_dir / data_dir / "dummy" ): _load_paths_from_metadata( data_dir, type=type, validate=cfg.dataset.validate and type == "training" ) for data_dir in tqdm(dataset, desc=f"Parsing dataset: {type}") }
def _load_paths_from_metadata(dataset_name, type="training", validate=False):
data_dir = dataset_name if cfg.dataset.use_hdf5 else cfg.data_dir / dataset_name
def _load_paths_from_metadata(group_name, type="training", validate=False):
data_dir = group_name if cfg.dataset.use_hdf5 else cfg.data_dir / group_name
_fn = _get_hdf5_paths if cfg.dataset.use_hdf5 else _get_paths_of_extensions
def key( id ):
if not cfg.dataset.use_hdf5:
return data_dir / id
def key( id, entry=None ):
return f"/{type}/{_get_hdf5_path(data_dir)}/{id}" if cfg.dataset.use_hdf5 else data_dir / id
return f"/{type}/{_get_hdf5_path(data_dir)}/{id}"
metadata_path = cfg.metadata_dir / f'{dataset_name}.json'
metadata_path = cfg.metadata_dir / f'{group_name}.json'
metadata = {}
if cfg.dataset.use_metadata and metadata_path.exists():
@@ -107,10 +103,7 @@ def _load_paths_from_metadata(dataset_name, type="training", validate=False):
if len(metadata) == 0:
return _fn( data_dir, type if cfg.dataset.use_hdf5 else _get_quant_extension(), validate )
def _validate( id ):
entry = metadata[id]
def _validate( id, entry ):
phones = entry['phones'] if "phones" in entry else 0
duration = entry['duration'] if "duration" in entry else 0
if type not in _total_durations:
@@ -118,14 +111,16 @@
_total_durations[type] += duration
"""
if cfg.dataset.use_hdf5:
k = key( id )
if k not in cfg.hdf5 or "audio" not in cfg.hdf5[k] or "text" not in cfg.hdf5[k]:
return False
"""
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration #and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
return [ key(id) for id in metadata.keys() if not validate or _validate(id) ]
return [ key(id, entry) for id, entry in metadata.items() if not validate or _validate(id, entry) ]
def _get_hdf5_path(path):
@@ -136,16 +131,16 @@ def _get_hdf5_path(path):
def _get_hdf5_paths( data_dir, type="training", validate=False ):
data_dir = str(data_dir)
def _validate( child ):
phones = child.attrs['phonemes']
duration = child.attrs['duration']
def _validate( id, entry ):
phones = entry.attrs['phonemes']
duration = entry.attrs['duration']
if type not in _total_durations:
_total_durations[type] = 0
_total_durations[type] += child.attrs['duration']
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
_total_durations[type] += entry.attrs['duration']
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration #and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
key = f"/{type}/{_get_hdf5_path(data_dir)}"
return [ Path(f"{key}/{child.attrs['id']}") for child in cfg.hdf5[key].values() if not validate or _validate(child) ] if key in cfg.hdf5 else []
return [ Path(f"{key}/{id}") for id, entry in cfg.hdf5[key].items() if not validate or _validate(id, entry) ] if key in cfg.hdf5 else []
def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ):
if isinstance(path, str):
@@ -807,47 +802,30 @@ def create_dataset_metadata( skip_existing=True ):
if id not in metadata:
metadata[id] = {}
# audio
utterance_metadata = {}
if audios:
if _get_quant_extension() == ".dac":
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
# ideally we'll encode Encodec-based audio in a similar manner, since .npy files are smaller than .pt files
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
duration = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
metadata[id]["metadata"] = {
"original_length": dac["metadata"]["original_length"],
"sample_rate": dac["metadata"]["sample_rate"],
}
else:
qnt = torch.load(f'{root}/{name}/{id}{_get_quant_extension()}')[0].t()
duration = qnt.shape[0] / cfg.dataset.frames_per_second
metadata[id]["duration"] = duration
else:
metadata[id]["duration"] = 0
if "text" in dac["metadata"]:
utterance_metadata["text"] = dac["metadata"]["text"]
if "phonemes" in dac["metadata"]:
utterance_metadata["phonemes"] = dac["metadata"]["phonemes"]
if "language" in dac["metadata"]:
utterance_metadata["language"] = dac["metadata"]["language"]
if "original_length" in dac["metadata"] and "sample_rate" in dac["metadata"]:
utterance_metadata["duration"] = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
# text
if texts:
if _get_phone_extension() == ".json":
json_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
content = json_metadata["phonemes"]
txt = json_metadata["text"]
lang = json_metadata["language"][:2]
else:
content = open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read().split(" ")
txt = ""
lang = "en"
if not utterance_metadata:
utterance_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
phn = cfg.tokenizer.encode("".join(content))
phn = np.array(phn).astype(np.uint8)
for k, v in utterance_metadata.items():
metadata[id][k] = v
metadata[id]["phones"] = len(phn)
metadata[id]["transcription"] = txt
metadata[id]["language"] = lang
except Exception as e:
#raise e
print(id, e)
#pass
tqdm.write(f'Error while processing {id}: {e}')
with open(str(metadata_path), "w", encoding="utf-8") as f:
f.write( json.dumps( metadata ) )
@@ -900,84 +878,68 @@ def create_dataset_hdf5( skip_existing=True ):
for id in tqdm(ids, desc=f"Processing {name}"):
try:
audio_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}') if audios else True
text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if texts else True
audio_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}')
text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if type != "Noise" else True
if not audio_exists or not text_exists:
if not audio_exists:
continue
key = f'{type}/{speaker_name}/{id}'
"""
if skip_existing and key in hf:
continue
"""
group = hf.create_group(key) if key not in hf else hf[key]
"""
group.attrs['id'] = id
group.attrs['type'] = type
group.attrs['speaker'] = speaker_name
"""
if id not in metadata:
metadata[id] = {}
utterance_metadata = {}
# audio
if audios:
if _get_quant_extension() == ".dac":
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
# ideally we'll encode Encodec-based audio in a similar manner, since .npy files are smaller than .pt files
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
duration = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
metadata[id]["metadata"] = {
"original_length": dac["metadata"]["original_length"],
"sample_rate": dac["metadata"]["sample_rate"],
}
else:
qnt = torch.load(f'{root}/{name}/{id}{_get_quant_extension()}')[0].t()
duration = qnt.shape[0] / cfg.dataset.frames_per_second
qnt = qnt.numpy().astype(np.int16)
if "text" in dac["metadata"]:
utterance_metadata["text"] = dac["metadata"]["text"]
if "phonemes" in dac["metadata"]:
utterance_metadata["phonemes"] = dac["metadata"]["phonemes"]
if "language" in dac["metadata"]:
utterance_metadata["language"] = dac["metadata"]["language"]
if "original_length" in dac["metadata"] and "sample_rate" in dac["metadata"]:
utterance_metadata["duration"] = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
if "audio" not in group:
group.create_dataset('audio', data=qnt, compression='lzf')
group.create_dataset('audio', data=qnt.numpy().astype(np.int16), compression='lzf')
group.attrs['duration'] = duration
metadata[id]["duration"] = duration
else:
group.attrs['duration'] = 0
metadata[id]["duration"] = 0
# text
if texts:
if _get_phone_extension() == ".json":
json_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
content = json_metadata["phonemes"]
txt = json_metadata["text"]
lang = json_metadata["language"][:2]
else:
content = open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read().split(" ")
txt = ""
lang = "en"
if not utterance_metadata and text_exists:
utterance_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
phn = cfg.tokenizer.encode("".join(content))
phn = "".join(utterance_metadata["phonemes"])
phn = cfg.tokenizer.encode(phn)
phn = np.array(phn).astype(np.uint8)
if "text" not in group:
group.create_dataset('text', data=phn, compression='lzf')
group.attrs['phonemes'] = len(phn)
group.attrs['transcription'] = txt
group.attrs['language'] = lang
for k, v in utterance_metadata.items():
group.attrs[k] = v
metadata[id][k] = v
metadata[id]["phones"] = len(phn)
metadata[id]["transcription"] = txt
metadata[id]["language"] = lang
else:
group.attrs['phonemes'] = 0
metadata[id]["phones"] = 0
except Exception as e:
#raise e
print(id, e)
#pass
tqdm.write(f'Error while processing {id}: {e}')
with open(str(metadata_path), "w", encoding="utf-8") as f:
f.write( json.dumps( metadata ) )
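With the writer above, each HDF5 entry holds the int16 code matrix under 'audio', the uint8 token ids under 'text', and the utterance metadata as group attributes. A read-back sketch, assuming h5py and a hypothetical file/key (real keys follow f'{type}/{speaker_name}/{id}'):

import h5py

with h5py.File("./training/dataset.h5", "r") as hf:  # hypothetical path
    entry = hf["training/speaker/utterance-0000"]
    codes = entry["audio"][:]   # int16 codes, written with lzf compression
    tokens = entry["text"][:]   # uint8 ids from cfg.tokenizer.encode(phonemes)
    print(dict(entry.attrs))    # duration, phonemes (count), plus utterance_metadata keys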
@@ -1002,119 +964,6 @@ def create_dataset_hdf5( skip_existing=True ):
hf.create_dataset('symmap', data=json.dumps(symmap))
hf.close()
def extract_dataset_hdf5( skip_existing=True ):
cfg.dataset.use_hdf5 = True
cfg.load_hdf5(write=False)
hf = cfg.hdf5
symmap = get_phone_symmap()
reverse_symmap = {"1":"<s>","2":"</s>","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"","12":"","13":"d","14":"ɹ","15":"tˈ","16":"","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"","75":"kˈ","76":"ɔ","77":"zˈ","78":"","79":"","80":"vˈ","81":"","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"","86":"pˈ","87":"ðˌ","88":"","89":"","90":"","91":"̩","92":"ʔ","93":"","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"","101":"uːˈ","102":"iːˈ","103":"","104":".ˈ","105":"","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"","127":"ɫ","128":"q","129":"","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":"","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"","150":"","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"","180":"","181":"“ˈ","182":"“ˌ","183":";ˈ","184":"","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"","221":"eˈ","222":"ʍ","223":"","224":"","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
root = str(cfg.data_dir)
def add( type="training", audios=True, texts=True ):
for group in tqdm( hf[f'{type}/data/'].keys(), desc=f"Processing {type}"):
for name in tqdm( hf[f'{type}/data/{group}'].keys(), desc=f"Processing {group}"):
(cfg.data_dir / group / name).mkdir(parents=True, exist_ok=True)
for id in tqdm( hf[f'{type}/data/{group}/{name}'].keys(), desc=f"Processing {name}"):
try:
key = f'{type}/data/{group}/{name}/{id}'
if key not in hf:
tqdm.write(f'Missing key: {key}')
continue
audio_exists = "audio" in hf[key]
text_exists = "text" in hf[key]
if not audio_exists or not text_exists:
tqdm.write(f'Missing audio/text: {key}')
continue
audio_path = Path(f'{root}/{group}/{name}/{id}.enc')
text_path = Path(f'{root}/{group}/{name}/{id}.json')
# audio
if audios and audio_exists and not audio_path.exists():
qnt = hf[key]["audio"][:, :]
torch.save( qnt, audio_path )
# text
if texts and text_exists and not text_path.exists():
tokens = hf[key]["text"][:][1:-1]
phones = [ reverse_symmap[f'{token}'] for token in tokens ]
phones = list("".join(phones).replace(" ", " "))
j = {
"text": "",
"phonemes": phones,
"language": "en"
}
with open(text_path, "w", encoding="utf-8") as f:
f.write( json.dumps( j ) )
except Exception as e:
raise e
add( type="training" )
add( type="validation" )
add( type="noise", texts=False )
hf.close()
def retokenize_dataset_hdf5( skip_existing=True ):
cfg.dataset.use_hdf5 = True
cfg.load_hdf5(write=True)
hf = cfg.hdf5
symmap = get_phone_symmap()
reverse_symmap = {"1":"<s>","2":"</s>","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"","12":"","13":"d","14":"ɹ","15":"tˈ","16":"","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"","75":"kˈ","76":"ɔ","77":"zˈ","78":"","79":"","80":"vˈ","81":"","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"","86":"pˈ","87":"ðˌ","88":"","89":"","90":"","91":"̩","92":"ʔ","93":"","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"","101":"uːˈ","102":"iːˈ","103":"","104":".ˈ","105":"","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"","127":"ɫ","128":"q","129":"","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":"","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"","150":"","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"","180":"","181":"“ˈ","182":"“ˌ","183":";ˈ","184":"","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"","221":"eˈ","222":"ʍ","223":"","224":"","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
root = str(cfg.data_dir)
def add( type="training" ):
for group in tqdm( hf[f'{type}/data/'].keys(), desc=f"Processing {type}"):
for name in tqdm( hf[f'{type}/data/{group}'].keys(), desc=f"Processing {group}"):
(cfg.data_dir / group / name).mkdir(parents=True, exist_ok=True)
for id in tqdm( hf[f'{type}/data/{group}/{name}'].keys(), desc=f"Processing {name}"):
try:
key = f'{type}/data/{group}/{name}/{id}'
if key not in hf:
tqdm.write(f'Missing key: {key}')
continue
if "text" not in hf[key]:
tqdm.write(f'Missing text: {key}')
continue
# text
tokens = hf[key]["text"][:][1:-1]
content = list("".join([ reverse_symmap[f'{token}'] for token in tokens ]).replace(" ", " "))
tokens = cfg.tokenizer.encode("".join(content))
tokens = np.array(tokens).astype(np.uint8)
del hf[key]['text']
hf[key].create_dataset('text', data=tokens, compression='lzf')
except Exception as e:
raise e
add( type="training" )
add( type="validation" )
# write symmap
if "symmap" in hf:
del hf['symmap']
hf.create_dataset('symmap', data=json.dumps(symmap))
hf.close()
if __name__ == "__main__":
import argparse
@@ -1135,10 +984,6 @@ if __name__ == "__main__":
if args.action == "hdf5":
create_dataset_hdf5()
if args.action == "extract-hdf5":
extract_dataset_hdf5()
if args.action == "retokenize-hdf5":
retokenize_dataset_hdf5()
elif args.action == "list-dataset":
dataset = []
for group in os.listdir(cfg.data_dir):
@@ -1147,7 +992,7 @@
continue
dataset.append(f'{group}/{name}')
print(dataset)
print(json.dumps(dataset))
elif args.action == "metadata":
create_dataset_metadata()
elif args.action == "sample":