final tweaks, hopefully, again

mrq 2024-05-15 23:04:19 -05:00
parent 8d79f78e0a
commit d9aabfa3ae
4 changed files with 450 additions and 570 deletions

View File

@@ -41,9 +41,7 @@
} }
], ],
"normalizer": null, "normalizer": null,
"pre_tokenizer": { "pre_tokenizer": null,
"type": "Whitespace"
},
"post_processor": { "post_processor": {
"type": "TemplateProcessing", "type": "TemplateProcessing",
"single": [ "single": [
@@ -110,263 +108,264 @@
"end_of_word_suffix": null, "end_of_word_suffix": null,
"fuse_unk": false, "fuse_unk": false,
"byte_fallback": false, "byte_fallback": false,
"ignore_merges": false,
"vocab": { "vocab": {
"<unk>": 0, "<unk>": 0,
"<bos>": 1, "<bos>": 1,
"</eos>": 2, "</eos>": 2,
"<mask>": 3, "<mask>": 3,
"!": 4, " ": 4,
"\"": 5, "!": 5,
"(": 6, "\"": 6,
")": 7, "(": 7,
",": 8, ")": 8,
"-": 9, ",": 9,
".": 10, "-": 10,
"1": 11, ".": 11,
":": 12, "1": 12,
"?": 13, ":": 13,
"a": 14, ";": 14,
"b": 15, "?": 15,
"c": 16, "a": 16,
"d": 17, "b": 17,
"e": 18, "c": 18,
"f": 19, "d": 19,
"h": 20, "e": 20,
"i": 21, "f": 21,
"j": 22, "h": 22,
"k": 23, "i": 23,
"l": 24, "j": 24,
"m": 25, "k": 25,
"n": 26, "l": 26,
"o": 27, "m": 27,
"p": 28, "n": 28,
"q": 29, "o": 29,
"r": 30, "p": 30,
"s": 31, "q": 31,
"t": 32, "r": 32,
"u": 33, "s": 33,
"v": 34, "t": 34,
"w": 35, "u": 35,
"x": 36, "v": 36,
"z": 37, "w": 37,
"¡": 38, "x": 38,
"«": 39, "z": 39,
"»": 40, "¡": 40,
"¿": 41, "«": 41,
"æ": 42, "»": 42,
"ç": 43, "¿": 43,
"ð": 44, "æ": 44,
"ŋ": 45, "ç": 45,
"ɐ": 46, "ð": 46,
"ɑ": 47, "ŋ": 47,
"ɔ": 48, "ɐ": 48,
"ɕ": 49, "ɑ": 49,
"ə": 50, "ɔ": 50,
"ɚ": 51, "ɕ": 51,
"ɛ": 52, "ə": 52,
"ɜ": 53, "ɚ": 53,
"ɟ": 54, "ɛ": 54,
"ɡ": 55, "ɜ": 55,
"ɪ": 56, "ɟ": 56,
"ɬ": 57, "ɡ": 57,
"ɯ": 58, "ɪ": 58,
"ɹ": 59, "ɬ": 59,
"ɾ": 60, "ɯ": 60,
"ʃ": 61, "ɹ": 61,
"ʈ": 62, "ɾ": 62,
"ʊ": 63, "ʃ": 63,
"ʋ": 64, "ʈ": 64,
"ʌ": 65, "ʊ": 65,
"ʑ": 66, "ʋ": 66,
"ʒ": 67, "ʌ": 67,
"ʔ": 68, "ʑ": 68,
"ʲ": 69, "ʒ": 69,
"ˈ": 70, "ʔ": 70,
"ˌ": 71, "ʲ": 71,
"ː": 72, "ˈ": 72,
"̃": 73, "ˌ": 73,
"̩": 74, "ː": 74,
"θ": 75, "̃": 75,
"": 76, "̩": 76,
"": 77, "θ": 77,
"": 78, "": 78,
"ˈɛ": 79, "": 79,
"iː": 80, "": 80,
"aɪ": 81, "": 81,
"nd": 82, "": 82,
"ˈɪ": 83, "ˈɛ": 83,
"eɪ": 84, "iː": 84,
"ˈæ": 85, "aɪ": 85,
"": 86, "nd": 86,
"ðə": 87, "ˈɪ": 87,
"ɑː": 88, "eɪ": 88,
"ˈeɪ": 89, "ˈæ": 89,
"ən": 90, "ðə": 90,
"uː": 91, "": 91,
"ˈʌ": 92, "ɑː": 92,
"ˈaɪ": 93, "ˈeɪ": 93,
"st": 94, "ən": 94,
"ˈɔ": 95, "uː": 95,
"ˈ": 96, "ˈʌ": 96,
"ˈiː": 97, "ˈaɪ": 97,
"ˈɑː": 98, "st": 98,
"ænd": 99, "ˈɔ": 99,
"ːɹ": 100, "ˈ": 100,
"ɪŋ": 101, "ˈiː": 101,
"ɜː": 102, "ˈɑː": 102,
"ɪn": 103, "ænd": 103,
"": 104, "ːɹ": 104,
"ʌv": 105, "ɪŋ": 105,
"": 106, "ɜː": 106,
"əl": 107, "ɪn": 107,
"ˈuː": 108, "": 108,
"": 109, "ʌv": 109,
"ɪz": 110, "": 110,
"ˈɜː": 111, "əl": 111,
"ˌʌ": 112, "ˈuː": 112,
"æt": 113, "tʃ": 113,
"": 114, "ɪz": 114,
"ˈɔː": 115, "ˈɜː": 115,
"ɪt": 116, "ˌʌ": 116,
"ˈ": 117, "æt": 117,
"ɚɹ": 118, "": 118,
"ˈɛn": 119, "ˈɔː": 119,
"": 120, "ɪt": 120,
"li": 121, "ˈ": 121,
"hiː": 122, "ɚɹ": 122,
"ˌɛ": 123, "ˈɛn": 123,
"wɪ": 124, "wʌ": 124,
"ðæt": 125, "li": 125,
"wʌz": 126, "hiː": 126,
"juː": 127, "ˌɛ": 127,
"oːɹ": 128, "wɪ": 128,
"ðɪ": 129, "wʌz": 129,
"sˈɛ": 130, "ðæt": 130,
"ˈɑːɹ": 131, "juː": 131,
"ˌɪ": 132, "oːɹ": 132,
"nt": 133, "ðɪ": 133,
"ˈʊ": 134, "sˈɛ": 134,
"ənt": 135, "ˌɪ": 135,
"hɪz": 136, "ˈɑːɹ": 136,
"": 137, "nt": 137,
"ˌɑː": 138, "ˈʊ": 138,
"ɔːɹ": 139, "ənt": 139,
"ˈɛɹ": 140, "hɪz": 140,
"wɪð": 141, "ˌɑː": 141,
"ᵻd": 142, "": 142,
"ˈoːɹ": 143, "ɔːɹ": 143,
"pɹ": 144, "ˈɛɹ": 144,
"ˈɔːl": 145, "wɪð": 145,
"": 146, "ᵻd": 146,
"ʃən": 147, "ˈoːɹ": 147,
"kt": 148, "": 148,
"ˌoʊ": 149, "ˈɔːl": 149,
"ˈɔːɹ": 150, "": 150,
"": 151, "ʃən": 151,
"æz": 152, "kt": 152,
"ʃiː": 153, "ˌoʊ": 153,
"ˌʌt": 154, "ˈɔːɹ": 154,
"ˈɛl": 155, "": 155,
"ˌaʊ": 156, "æz": 156,
"ˈʌn": 157, "ˌʌt": 157,
"əs": 158, "ʃiː": 158,
"ː": 159, "ˈɛl": 159,
"lˈaɪ": 160, "ˌaʊ": 160,
"ˈæn": 161, "ˈʌn": 161,
"ˈɪɹ": 162, "əs": 162,
"ʊd": 163, "ː": 163,
"ɹᵻ": 164, "lˈaɪ": 164,
"ld": 165, "ˈæn": 165,
"bˌʌt": 166, "ˈɪɹ": 166,
"ks": 167, "ʊd": 167,
"nˈ": 168, "ɹᵻ": 168,
"ɾɚ": 169, "ld": 169,
"hæd": 170, "bˌʌt": 170,
"ɛɹ": 171, "ks": 171,
"ˈɪŋ": 172, "nˈ": 172,
"ɡɹ": 173, "hæd": 173,
"ɔn": 174, "ɾɚ": 174,
"ɑː": 175, "ɛɹ": 175,
"maɪ": 176, "ˈɪŋ": 176,
"": 177, "ɡɹ": 177,
"ːɹ": 178, "ɑː": 178,
"ðɚ": 179, "ɔn": 179,
"": 180, "": 180,
"ðɛɹ": 181, "maɪ": 181,
"ˈʌm": 182, "ːɹ": 182,
"ɑːt": 183, "ðɚ": 183,
"tɹ": 184, "tʊ": 184,
"sˈiː": 185, "ðɛɹ": 185,
"ʌvðə": 186, "ɑːt": 186,
"mˈɪ": 187, "ˈʌm": 187,
"ˈæp": 188, "": 188,
"ˌɪm": 189, "sˈiː": 189,
"ɪk": 190, "ʌvðə": 190,
"sp": 191, "mˈɪ": 191,
"lˈeɪ": 192, "hˈæ": 192,
"hˌɪm": 193, "ˌɪm": 193,
"ɐn": 194, "lˈeɪ": 194,
"ðeɪ": 195, "ɪk": 195,
"lˈɪ": 196, "sp": 196,
"ɾi": 197, "ɪm": 197,
"": 198, "ɐn": 198,
"lˈɛ": 199, "ðeɪ": 199,
"": 200, "lˈɪ": 200,
"ˈɪl": 201, "ɾi": 201,
"jˈuː": 202, "lˈɛ": 202,
"ʌm": 203, "": 203,
"mˌiː": 204, "": 204,
"lˈæ": 205, "lˈæ": 205,
"ˌɪn": 206, "ˈɪl": 206,
"bᵻ": 207, "jˈuː": 207,
"wˈʌn": 208, "ʌm": 208,
"ˈɪn": 209, "mˌiː": 209,
"ˈoʊn": 210, "bᵻ": 210,
"biː": 211, "wˈʌn": 211,
"sˈɛd": 212, "ˌɪn": 212,
"ˈɛd": 213, "ˈɪn": 213,
"ˈaɪt": 214, "ˈoʊn": 214,
"fɹʌm": 215, "sˈɛd": 215,
"baɪ": 216, "biː": 216,
"ɪs": 217, "ˈɛd": 217,
"ɚz": 218, "ˈaɪt": 218,
"ðɪs": 219, "baɪ": 219,
"əns": 220, "fɹʌm": 220,
"ɪf": 221, "ɪs": 221,
"bəl": 222, "ɚz": 222,
"ˈænd": 223, "ðɪs": 223,
"ɪnðə": 224, "əns": 224,
"əm": 225, "bəl": 225,
"iːz": 226, "ɪf": 226,
"ˌuː": 227, "ɪnðə": 227,
"ᵻz": 228, "əm": 228,
"wˈeɪ": 229, "ᵻz": 229,
"ft": 230, "ˌuː": 230,
"wiː": 231, "wˈeɪ": 231,
"lˈiː": 232, "ft": 232,
"stɹ": 233, "wiː": 233,
"": 234, "stɹ": 234,
"ɚd": 235, "lˈiː": 235,
"ˌaɪ": 236, "iːz": 236,
"kw": 237, "pt": 237,
"ˌɔn": 238, "": 238,
"ˈaɪd": 239, "ɚd": 239,
"ts": 240, "ˌaɪ": 240,
"ɪm": 241, "kw": 241,
"ˈʌst": 242, "ˌɔn": 242,
"ˈoʊld": 243, "ˈaɪd": 243,
"ˌɪ": 244, "ɪm": 244,
"dˈɪ": 245, "ˈʌst": 245,
"sˌoʊ": 246, "ˈoʊld": 246,
"ɑːɹ": 247, "ts": 247,
"": 248, "ˌɪ": 248,
"sˈeɪ": 249, "sˌoʊ": 249,
"ɾᵻd": 250, "dˈɪ": 250,
"dᵻ": 251, "ɑːɹ": 251,
"ɪ": 252, "": 252,
"sˈɛl": 253, "sˈeɪ": 253,
"ɹi": 254, "ɾᵻd": 254,
"ˈʌðɚ": 255 "ɪ": 255
}, },
"merges": [ "merges": [
"ˈ ɛ", "ˈ ɛ",
@@ -376,8 +375,8 @@
"ˈ ɪ", "ˈ ɪ",
"e ɪ", "e ɪ",
"ˈ æ", "ˈ æ",
"o ʊ",
"ð ə", "ð ə",
"o ʊ",
"ɑ ː", "ɑ ː",
"ˈ eɪ", "ˈ eɪ",
"ə n", "ə n",
@@ -415,20 +414,20 @@
"h iː", "h iː",
"ˌ ɛ", "ˌ ɛ",
"w ɪ", "w ɪ",
"ð æt",
"wʌ z", "wʌ z",
"ð æt",
"j uː", "j uː",
"o ːɹ", "o ːɹ",
ɪ", ɪ",
"s ˈɛ", "s ˈɛ",
"ˈɑː ɹ",
ɪ", ɪ",
"ˈɑː ɹ",
"n t", "n t",
"ˈ ʊ", "ˈ ʊ",
"ən t", "ən t",
"h ɪz", "h ɪz",
"h æ",
ɑː", ɑː",
"h æ",
"ɔ ːɹ", "ɔ ːɹ",
"ˈɛ ɹ", "ˈɛ ɹ",
"wɪ ð", "wɪ ð",
@@ -443,8 +442,8 @@
"ˈɔ ːɹ", "ˈɔ ːɹ",
"f ɹ", "f ɹ",
"æ z", "æ z",
"ʃ iː",
"ˌʌ t", "ˌʌ t",
"ʃ iː",
"ˈɛ l", "ˈɛ l",
"ˌ aʊ", "ˌ aʊ",
"ˈʌ n", "ˈʌ n",
@@ -459,93 +458,89 @@
"b ˌʌt", "b ˌʌt",
"k s", "k s",
"n ˈoʊ", "n ˈoʊ",
"ɾ ɚ",
"hæ d", "hæ d",
"ɾ ɚ",
"ɛ ɹ", "ɛ ɹ",
"ˈɪ ŋ", "ˈɪ ŋ",
"ɡ ɹ", "ɡ ɹ",
"ɔ n",
"n ˌɑː", "n ˌɑː",
"m aɪ", "ɔ n",
"v ɚ", "v ɚ",
"m aɪ",
"f ɔːɹ", "f ɔːɹ",
"ð ɚ", "ð ɚ",
"t ʊ", "t ʊ",
"ð ɛɹ", "ð ɛɹ",
"ˈʌ m",
"nˌɑː t", "nˌɑː t",
"ˈʌ m",
"t ɹ", "t ɹ",
"s ˈiː", "s ˈiː",
"ʌv ðə", "ʌv ðə",
"m ˈɪ", "m ˈɪ",
"ˈæ p", "h ˈæ",
"ˌɪ m", "ˌɪ m",
"l ˈeɪ",
"ɪ k", "ɪ k",
"s p", "s p",
"l ˈeɪ",
"h ˌɪm", "h ˌɪm",
"ɐ n", "ɐ n",
"ð eɪ", "ð eɪ",
"l ˈɪ", "l ˈɪ",
"ɾ i", "ɾ i",
"b ɹ",
"l ˈɛ", "l ˈɛ",
"b ɹ",
"k ɹ", "k ɹ",
"l ˈæ",
"ˈɪ l", "ˈɪ l",
"j ˈuː", "j ˈuː",
"ʌ m", "ʌ m",
"mˌ iː", "mˌ iː",
"l ˈæ",
ɪn",
"b ᵻ", "b ᵻ",
"w ˈʌn", "w ˈʌn",
ɪn",
"ˈɪ n", "ˈɪ n",
"ˈoʊ n", "ˈoʊ n",
"b iː",
"sˈɛ d", "sˈɛ d",
"b iː",
"ˈɛ d", "ˈɛ d",
"ˈaɪ t", "ˈaɪ t",
"fɹ ʌm",
"b aɪ", "b aɪ",
"fɹ ʌm",
"ɪ s", "ɪ s",
"ɚ z", "ɚ z",
"ðɪ s", "ðɪ s",
"ən s", "ən s",
"ɪ f",
"b əl", "b əl",
"ˈæ nd", "ɪ f",
"ɪn ðə", "ɪn ðə",
"ə m", "ə m",
"iː z",
"ˌ uː",
"ᵻ z", "ᵻ z",
"ˌ uː",
"w ˈeɪ", "w ˈeɪ",
"f t", "f t",
"w iː", "w iː",
"l ˈiː",
"st ɹ", "st ɹ",
"l ˈiː",
"iː z",
"p t",
"j ʊ", "j ʊ",
"ɚ d", "ɚ d",
"ˌ aɪ", "ˌ aɪ",
"k w", "k w",
"ˌ ɔn", "ˌ ɔn",
"ˈaɪ d", "ˈaɪ d",
"t s",
"ɪ m", "ɪ m",
"ˈʌ st", "ˈʌ st",
"ˈoʊ ld", "ˈoʊ ld",
"t s",
"ˌɪ tʃ", "ˌɪ tʃ",
"d ˈɪ",
"s ˌoʊ", "s ˌoʊ",
"d ˈɪ",
"ɑː ɹ", "ɑː ɹ",
"h ɐ", "h ɐ",
"s ˈeɪ", "s ˈeɪ",
"ɾ ᵻd", "ɾ ᵻd",
"d ᵻ", "w ˌɪtʃ"
"w ˌɪtʃ",
"sˈɛ l",
"ɹ i",
"ˈʌ ðɚ"
] ]
} }
} }
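Aside: the retrained tokenizer drops the Whitespace pre-tokenizer and instead promotes " " and ";" to ordinary vocab entries, shifting the ids of everything after <mask>. A minimal sanity check against the updated JSON; the file path is an assumption, since it isn't shown on this page:

import json

with open("./training/tokenizer.json", "r", encoding="utf-8") as f:
    tok = json.load(f)

assert tok["pre_tokenizer"] is None            # was {"type": "Whitespace"}
vocab = tok["model"]["vocab"]
assert vocab[" "] == 4 and vocab[";"] == 14    # new entries; later ids shift accordingly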

View File

@@ -6,16 +6,22 @@ import torchaudio
from tqdm.auto import tqdm from tqdm.auto import tqdm
from pathlib import Path from pathlib import Path
from vall_e.config import cfg from vall_e.config import cfg
# things that could be args
cfg.sample_rate = 44_000
cfg.inference.audio_backend = "dac"
"""
cfg.inference.weight_dtype = "bfloat16"
cfg.inference.dtype = torch.bfloat16
cfg.inference.amp = True
"""
from vall_e.emb.g2p import encode as valle_phonemize from vall_e.emb.g2p import encode as valle_phonemize
from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension
# things that could be args
cfg.sample_rate = 24_000
cfg.inference.audio_backend = "encodec"
input_audio = "voices" input_audio = "voices"
input_metadata = "./training/metadata" input_metadata = "metadata"
output_dataset = f"./training/data-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}" output_dataset = f"training-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}"
device = "cuda" device = "cuda"
audio_extension = ".dac" if cfg.inference.audio_backend == "dac" else ".enc" audio_extension = ".dac" if cfg.inference.audio_backend == "dac" else ".enc"
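For reference, the folder name built above encodes both knobs; evaluating the f-string for the two configurations this commit switches between:

for sample_rate, backend in [(24_000, "encodec"), (44_000, "dac")]:
    print(f"training-{'2' if sample_rate == 24_000 else '4'}4KHz-{backend}")
# -> training-24KHz-encodec
# -> training-44KHz-dac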
@@ -34,9 +40,6 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
if not os.path.isdir(f'./{input_audio}/{dataset_name}/'): if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
print("Is not dir:", f'./{input_audio}/{dataset_name}/') print("Is not dir:", f'./{input_audio}/{dataset_name}/')
continue continue
if dataset_name in ["LibriVox", "Audiobooks"]:
continue
for speaker_id in tqdm(sorted(os.listdir(f'./{input_audio}/{dataset_name}/')), desc=f"Processing speaker in {dataset_name}"): for speaker_id in tqdm(sorted(os.listdir(f'./{input_audio}/{dataset_name}/')), desc=f"Processing speaker in {dataset_name}"):
if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'): if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
@@ -55,10 +58,29 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
waveform, sample_rate = torchaudio.load(inpath) waveform, sample_rate = torchaudio.load(inpath)
qnt = valle_quantize(waveform, sr=sample_rate, device=device) qnt = valle_quantize(waveform, sr=sample_rate, device=device)
if cfg.inference.audio_backend == "dac": if cfg.inference.audio_backend == "dac":
qnt.save(_replace_file_extension(outpath, audio_extension)) np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.codes.numpy().astype(np.uint16),
"metadata": {
"original_length": qnt.original_length,
"sample_rate": qnt.sample_rate,
"input_db": qnt.input_db.numpy().astype(np.float32),
"chunk_length": qnt.chunk_length,
"channels": qnt.channels,
"padding": qnt.padding,
"dac_version": "1.0.0",
},
})
else: else:
torch.save( qnt, _replace_file_extension(outpath, audio_extension) ) np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.numpy().astype(np.uint16),
"metadata": {
"original_length": waveform.shape[-1],
"sample_rate": sample_rate,
},
})
continue continue
@@ -91,7 +113,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
fname = filename.replace(f'.{extension}', "") fname = filename.replace(f'.{extension}', "")
waveform, sample_rate = None, None waveform, sample_rate = None, None
language = metadata[filename]["language"] if "language" in metadata[filename] else "english" language = metadata[filename]["language"] if "language" in metadata[filename] else "en"
if len(metadata[filename]["segments"]) == 0 or not use_slices: if len(metadata[filename]["segments"]) == 0 or not use_slices:
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}.{extension}') outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}.{extension}')
@@ -100,86 +122,101 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
if len(text) == 0: if len(text) == 0:
continue continue
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists(): if _replace_file_extension(outpath, audio_extension).exists():
continue continue
if not _replace_file_extension(outpath, ".json").exists(): if waveform is None:
txts.append(( waveform, sample_rate = torchaudio.load(inpath)
outpath, if waveform.shape[0] > 1:
text, waveform = torch.mean(waveform, dim=0, keepdim=True)
language,
))
if not _replace_file_extension(outpath, audio_extension).exists():
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
wavs.append(( wavs.append((
outpath, outpath,
waveform, text,
sample_rate language,
)) waveform,
sample_rate
))
else: else:
i = 0 i = 0
for segment in metadata[filename]["segments"]: for segment in metadata[filename]["segments"]:
id = pad(i, 4) id = pad(i, 4)
i = i + 1 i = i + 1
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists(): outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
text = metadata[filename]["text"]
if len(text) == 0:
continue continue
if not _replace_file_extension(outpath, ".json").exists(): if _replace_file_extension(outpath, audio_extension).exists():
txts.append(( continue
outpath,
segment["text"],
language,
))
if not _replace_file_extension(outpath, audio_extension).exists():
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
start = int(segment['start'] * sample_rate) if waveform is None:
end = int(segment['end'] * sample_rate) waveform, sample_rate = torchaudio.load(inpath)
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)
if start < 0: start = int(segment['start'] * sample_rate)
start = 0 end = int(segment['end'] * sample_rate)
if end >= waveform.shape[-1]:
end = waveform.shape[-1] - 1
if end - start < 0: if start < 0:
continue start = 0
if end >= waveform.shape[-1]:
end = waveform.shape[-1] - 1
wavs.append(( if end - start < 0:
outpath, continue
waveform[:, start:end],
sample_rate
))
if len(txts) > 0: wavs.append((
for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}", disable=True): outpath,
outpath, text, language = job text,
phones = valle_phonemize(text) language,
data = { waveform[:, start:end],
"text": text.strip(), sample_rate
"phonemes": phones, ))
"language": language,
}
open(_replace_file_extension(outpath, ".json"), 'w', encoding='utf-8').write(json.dumps(data))
if len(wavs) > 0: if len(wavs) > 0:
for job in tqdm(wavs, desc=f"Quantizing: {speaker_id}"): for job in tqdm(wavs, desc=f"Quantizing: {speaker_id}"):
try: try:
outpath, waveform, sample_rate = job outpath, text, language, waveform, sample_rate = job
phones = valle_phonemize(text)
qnt = valle_quantize(waveform, sr=sample_rate, device=device) qnt = valle_quantize(waveform, sr=sample_rate, device=device)
if cfg.inference.audio_backend == "dac": if cfg.inference.audio_backend == "dac":
qnt.save(_replace_file_extension(outpath, audio_extension)) np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.codes.numpy().astype(np.uint16),
"metadata": {
"original_length": qnt.original_length,
"sample_rate": qnt.sample_rate,
"input_db": qnt.input_db.numpy().astype(np.float32),
"chunk_length": qnt.chunk_length,
"channels": qnt.channels,
"padding": qnt.padding,
"dac_version": "1.0.0",
"text": text.strip(),
"phonemes": "".join(phones),
"language": language,
},
})
else: else:
torch.save( qnt, _replace_file_extension(outpath, audio_extension) ) np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.numpy().astype(np.uint16),
"metadata": {
"original_length": waveform.shape[-1],
"sample_rate": sample_rate,
"text": text.strip(),
"phonemes": "".join(phones),
"language": language,
},
})
except Exception as e: except Exception as e:
print(f"Failed to quantize: {outpath}:", e) print(f"Failed to quantize: {outpath}:", e)
continue continue
open("./missing.json", 'w', encoding='utf-8').write(json.dumps(missing)) open("./missing.json", 'w', encoding='utf-8').write(json.dumps(missing))
open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset)) open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))

View File

@@ -38,20 +38,23 @@ else:
metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/{id}') metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/{id}')
metadata = json.loads(open(metadata_path, "r", encoding="utf-8").read()) metadata = json.loads(open(metadata_path, "r", encoding="utf-8").read())
if "phonemes" not in metadata:
continue
tokenizer_data.append( f'{"".join(metadata["phonemes"])}' ) tokenizer_data.append( f'{"".join(metadata["phonemes"])}' )
open(output_file, 'w', encoding='utf-8').write(json.dumps(tokenizer_data)) open(output_file, 'w', encoding='utf-8').write(json.dumps(tokenizer_data))
unk_token = "<unk>" unk_token = "<unk>"
spl_tokens = ["<bos>", "</eos>", unk_token, "<mask>"] spl_tokens = [unk_token, "<bos>", "</eos>", "<mask>", "<space>"]
trainer = BpeTrainer(special_tokens = spl_tokens, vocab_size = 256) trainer = BpeTrainer(special_tokens = spl_tokens, vocab_size = 256)
tokenizer = Tokenizer(BPE(unk_token = unk_token)) tokenizer = Tokenizer(BPE(unk_token = unk_token))
tokenizer.pre_tokenizer = Whitespace() tokenizer.pre_tokenizer = Whitespace() # takes 2 hours to process without this, we'll just manually add spaces as a token
tokenizer.post_processor = TemplateProcessing( tokenizer.post_processor = TemplateProcessing(
single="<bos> $A <eos>", single="<bos> $A <eos>",
special_tokens=[("<bos>", 1), ("<eos>", 2)], special_tokens=[("<bos>", 1), ("<eos>", 2)],
) )
tokenizer.train_from_iterator(tokenizer_data, trainer=trainer) tokenizer.train_from_iterator(tokenizer_data, trainer=trainer)
tokenizer.save("./training/tokenizer.json") tokenizer.save("./training/tokenizer_training_data.json")

View File

@@ -86,19 +86,15 @@ def _calculate_durations( type="training" ):
def _load_paths(dataset, type="training"): def _load_paths(dataset, type="training"):
return { cfg.get_spkr( cfg.data_dir / data_dir / "dummy" ): _load_paths_from_metadata( data_dir, type=type, validate=cfg.dataset.validate and type == "training" ) for data_dir in tqdm(dataset, desc=f"Parsing dataset: {type}") } return { cfg.get_spkr( cfg.data_dir / data_dir / "dummy" ): _load_paths_from_metadata( data_dir, type=type, validate=cfg.dataset.validate and type == "training" ) for data_dir in tqdm(dataset, desc=f"Parsing dataset: {type}") }
def _load_paths_from_metadata(dataset_name, type="training", validate=False): def _load_paths_from_metadata(group_name, type="training", validate=False):
data_dir = dataset_name if cfg.dataset.use_hdf5 else cfg.data_dir / dataset_name data_dir = group_name if cfg.dataset.use_hdf5 else cfg.data_dir / group_name
_fn = _get_hdf5_paths if cfg.dataset.use_hdf5 else _get_paths_of_extensions _fn = _get_hdf5_paths if cfg.dataset.use_hdf5 else _get_paths_of_extensions
def key( id ): def key( id, entry=None ):
if not cfg.dataset.use_hdf5: return f"/{type}/{_get_hdf5_path(data_dir)}/{id}" if cfg.dataset.use_hdf5 else data_dir / id
return data_dir / id
return f"/{type}/{_get_hdf5_path(data_dir)}/{id}" metadata_path = cfg.metadata_dir / f'{group_name}.json'
metadata_path = cfg.metadata_dir / f'{dataset_name}.json'
metadata = {} metadata = {}
if cfg.dataset.use_metadata and metadata_path.exists(): if cfg.dataset.use_metadata and metadata_path.exists():
@@ -107,10 +103,7 @@ def _load_paths_from_metadata(dataset_name, type="training", validate=False):
if len(metadata) == 0: if len(metadata) == 0:
return _fn( data_dir, type if cfg.dataset.use_hdf5 else _get_quant_extension(), validate ) return _fn( data_dir, type if cfg.dataset.use_hdf5 else _get_quant_extension(), validate )
def _validate( id, entry ):
def _validate( id ):
entry = metadata[id]
phones = entry['phones'] if "phones" in entry else 0 phones = entry['phones'] if "phones" in entry else 0
duration = entry['duration'] if "duration" in entry else 0 duration = entry['duration'] if "duration" in entry else 0
if type not in _total_durations: if type not in _total_durations:
@@ -118,14 +111,16 @@ def _load_paths_from_metadata(dataset_name, type="training", validate=False):
_total_durations[type] += duration _total_durations[type] += duration
"""
if cfg.dataset.use_hdf5: if cfg.dataset.use_hdf5:
k = key( id ) k = key( id )
if k not in cfg.hdf5 or "audio" not in cfg.hdf5[k] or "text" not in cfg.hdf5[k]: if k not in cfg.hdf5 or "audio" not in cfg.hdf5[k] or "text" not in cfg.hdf5[k]:
return False return False
"""
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration #and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
return [ key(id) for id in metadata.keys() if not validate or _validate(id) ] return [ key(id, entry) for id, entry in metadata.items() if not validate or _validate(id, entry) ]
def _get_hdf5_path(path): def _get_hdf5_path(path):
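With this refactor, _validate receives the metadata entry directly instead of re-indexing the dict, and only the duration bounds are actually enforced (the phone-count check is commented out). A standalone sketch of the filter over a per-group metadata file from cfg.metadata_dir; ids and numbers here are made up:

metadata = {
    "utt_0000": {"duration": 3.71, "phones": 52},   # kept
    "utt_0001": {"duration": 0.12, "phones": 4},    # dropped by the duration bounds
}

min_duration, max_duration = 1.0, 32.0  # stand-ins for cfg.dataset.min/max_duration
kept = [id for id, entry in metadata.items()
        if min_duration <= entry.get("duration", 0) <= max_duration]
print(kept)  # ['utt_0000']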
@@ -136,16 +131,16 @@ def _get_hdf5_path(path):
def _get_hdf5_paths( data_dir, type="training", validate=False ): def _get_hdf5_paths( data_dir, type="training", validate=False ):
data_dir = str(data_dir) data_dir = str(data_dir)
def _validate( child ): def _validate( id, entry ):
phones = child.attrs['phonemes'] phones = entry.attrs['phonemes']
duration = child.attrs['duration'] duration = entry.attrs['duration']
if type not in _total_durations: if type not in _total_durations:
_total_durations[type] = 0 _total_durations[type] = 0
_total_durations[type] += child.attrs['duration'] _total_durations[type] += entry.attrs['duration']
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration #and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
key = f"/{type}/{_get_hdf5_path(data_dir)}" key = f"/{type}/{_get_hdf5_path(data_dir)}"
return [ Path(f"{key}/{child.attrs['id']}") for child in cfg.hdf5[key].values() if not validate or _validate(child) ] if key in cfg.hdf5 else [] return [ Path(f"{key}/{id}") for id, entry in cfg.hdf5[key].items() if not validate or _validate(id, entry) ] if key in cfg.hdf5 else []
def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ): def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ):
if isinstance(path, str): if isinstance(path, str):
@@ -807,47 +802,30 @@ def create_dataset_metadata( skip_existing=True ):
if id not in metadata: if id not in metadata:
metadata[id] = {} metadata[id] = {}
# audio utterance_metadata = {}
if audios: if audios:
if _get_quant_extension() == ".dac": # ideally we'll encode Encodec-based audio in a similar manner because np has smaller files than pt
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()] dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16) qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
duration = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"] if "text" in dac["metadata"]:
metadata[id]["metadata"] = { utterance_metadata["text"] = dac["metadata"]["text"]
"original_length": dac["metadata"]["original_length"], if "phonemes" in dac["metadata"]:
"sample_rate": dac["metadata"]["sample_rate"], utterance_metadata["phonemes"] = dac["metadata"]["phonemes"]
} if "language" in dac["metadata"]:
else: utterance_metadata["language"] = dac["metadata"]["language"]
qnt = torch.load(f'{root}/{name}/{id}{_get_quant_extension()}')[0].t() if "original_length" in dac["metadata"] and "sample_rate" in dac["metadata"]:
duration = qnt.shape[0] / cfg.dataset.frames_per_second utterance_metadata["duration"] = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
metadata[id]["duration"] = duration
else:
metadata[id]["duration"] = 0
# text # text
if texts: if texts:
if _get_phone_extension() == ".json": if not utterance_metadata:
json_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read()) utterance_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
content = json_metadata["phonemes"]
txt = json_metadata["text"]
lang = json_metadata["language"][:2]
else:
content = open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read().split(" ")
txt = ""
lang = "en"
phn = cfg.tokenizer.encode("".join(content)) for k, v in utterance_metadata.items():
phn = np.array(phn).astype(np.uint8) metadata[id][k] = v
metadata[id]["phones"] = len(phn)
metadata[id]["transcription"] = txt
metadata[id]["language"] = lang
except Exception as e: except Exception as e:
#raise e tqdm.write(f'Error while processing {id}: {e}')
print(id, e)
#pass
with open(str(metadata_path), "w", encoding="utf-8") as f: with open(str(metadata_path), "w", encoding="utf-8") as f:
f.write( json.dumps( metadata ) ) f.write( json.dumps( metadata ) )
@@ -900,84 +878,68 @@ def create_dataset_hdf5( skip_existing=True ):
for id in tqdm(ids, desc=f"Processing {name}"): for id in tqdm(ids, desc=f"Processing {name}"):
try: try:
audio_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}') if audios else True audio_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}')
text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if texts else True text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if type != "Noise" else True
if not audio_exists or not text_exists: if not audio_exists:
continue continue
key = f'{type}/{speaker_name}/{id}' key = f'{type}/{speaker_name}/{id}'
"""
if skip_existing and key in hf: if skip_existing and key in hf:
continue continue
"""
group = hf.create_group(key) if key not in hf else hf[key] group = hf.create_group(key) if key not in hf else hf[key]
"""
group.attrs['id'] = id group.attrs['id'] = id
group.attrs['type'] = type group.attrs['type'] = type
group.attrs['speaker'] = speaker_name group.attrs['speaker'] = speaker_name
"""
if id not in metadata: if id not in metadata:
metadata[id] = {} metadata[id] = {}
utterance_metadata = {}
# audio # audio
if audios: if audios:
if _get_quant_extension() == ".dac": # ideally we'll encode Encodec-based audio in a similar manner because np has smaller files than pt
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()] dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16) qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
duration = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"] if "text" in dac["metadata"]:
metadata[id]["metadata"] = { utterance_metadata["text"] = dac["metadata"]["text"]
"original_length": dac["metadata"]["original_length"], if "phonemes" in dac["metadata"]:
"sample_rate": dac["metadata"]["sample_rate"], utterance_metadata["phonemes"] = dac["metadata"]["phonemes"]
} if "language" in dac["metadata"]:
else: utterance_metadata["language"] = dac["metadata"]["language"]
qnt = torch.load(f'{root}/{name}/{id}{_get_quant_extension()}')[0].t() if "original_length" in dac["metadata"] and "sample_rate" in dac["metadata"]:
duration = qnt.shape[0] / cfg.dataset.frames_per_second utterance_metadata["duration"] = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
qnt = qnt.numpy().astype(np.int16)
if "audio" not in group: if "audio" not in group:
group.create_dataset('audio', data=qnt, compression='lzf') group.create_dataset('audio', data=qnt.numpy().astype(np.int16), compression='lzf')
group.attrs['duration'] = duration
metadata[id]["duration"] = duration
else:
group.attrs['duration'] = 0
metadata[id]["duration"] = 0
# text # text
if texts: if texts:
if _get_phone_extension() == ".json": if not utterance_metadata and text_exists:
json_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read()) utterance_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
content = json_metadata["phonemes"]
txt = json_metadata["text"]
lang = json_metadata["language"][:2]
else:
content = open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read().split(" ")
txt = ""
lang = "en"
phn = cfg.tokenizer.encode("".join(content)) phn = "".join(utterance_metadata["phonemes"])
phn = cfg.tokenizer.encode(phn)
phn = np.array(phn).astype(np.uint8) phn = np.array(phn).astype(np.uint8)
if "text" not in group: if "text" not in group:
group.create_dataset('text', data=phn, compression='lzf') group.create_dataset('text', data=phn, compression='lzf')
group.attrs['phonemes'] = len(phn) for k, v in utterance_metadata.items():
group.attrs['transcription'] = txt group.attrs[k] = v
group.attrs['language'] = lang metadata[id][k] = v
metadata[id]["phones"] = len(phn)
metadata[id]["transcription"] = txt
metadata[id]["language"] = lang
else:
group.attrs['phonemes'] = 0
metadata[id]["phones"] = 0
except Exception as e: except Exception as e:
#raise e tqdm.write(f'Error while processing {id}: {e}')
print(id, e)
#pass
with open(str(metadata_path), "w", encoding="utf-8") as f: with open(str(metadata_path), "w", encoding="utf-8") as f:
f.write( json.dumps( metadata ) ) f.write( json.dumps( metadata ) )
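Each utterance now lands under a {type}/{speaker_name}/{id} group holding an lzf-compressed int16 'audio' dataset, a uint8 'text' dataset of tokenizer ids, and the utterance metadata mirrored into the group's attrs. A reader sketch; the archive path and the ids are assumptions, since neither appears in this diff:

import h5py

with h5py.File("./training/dataset.h5", "r") as hf:  # actual path comes from the project config
    group = hf["training/some_speaker/utt_0000"]     # {type}/{speaker_name}/{id}
    codes = group["audio"][:]                        # int16 codebook indices
    tokens = group["text"][:]                        # uint8 phoneme-token ids
    print(group.attrs.get("duration"), group.attrs.get("language"))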
@@ -1002,119 +964,6 @@ def create_dataset_hdf5( skip_existing=True ):
hf.create_dataset('symmap', data=json.dumps(symmap)) hf.create_dataset('symmap', data=json.dumps(symmap))
hf.close() hf.close()
def extract_dataset_hdf5( skip_existing=True ):
cfg.dataset.use_hdf5 = True
cfg.load_hdf5(write=False)
hf = cfg.hdf5
symmap = get_phone_symmap()
reverse_symmap = {"1":"<s>","2":"</s>","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"","12":"","13":"d","14":"ɹ","15":"tˈ","16":"","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"","75":"kˈ","76":"ɔ","77":"zˈ","78":"","79":"","80":"vˈ","81":"","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"","86":"pˈ","87":"ðˌ","88":"","89":"","90":"","91":"̩","92":"ʔ","93":"","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"","101":"uːˈ","102":"iːˈ","103":"","104":".ˈ","105":"","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"","127":"ɫ","128":"q","129":"","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":"","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"","150":"","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"","180":"","181":"“ˈ","182":"“ˌ","183":";ˈ","184":"","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"","221":"eˈ","222":"ʍ","223":"","224":"","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
root = str(cfg.data_dir)
def add( type="training", audios=True, texts=True ):
for group in tqdm( hf[f'{type}/data/'].keys(), desc=f"Processing {type}"):
for name in tqdm( hf[f'{type}/data/{group}'].keys(), desc=f"Processing {group}"):
(cfg.data_dir / group / name).mkdir(parents=True, exist_ok=True)
for id in tqdm( hf[f'{type}/data/{group}/{name}'].keys(), desc=f"Processing {name}"):
try:
key = f'{type}/data/{group}/{name}/{id}'
if key not in hf:
tqdm.write(f'Missing key: {key}')
continue
audio_exists = "audio" in hf[key]
text_exists = "text" in hf[key]
if not audio_exists or not text_exists:
tqdm.write(f'Missing audio/text: {key}')
continue
audio_path = Path(f'{root}/{group}/{name}/{id}.enc')
text_path = Path(f'{root}/{group}/{name}/{id}.json')
# audio
if audios and audio_exists and not audio_path.exists():
qnt = hf[key]["audio"][:, :]
torch.save( qnt, audio_path )
# text
if texts and text_exists and not text_path.exists():
tokens = hf[key]["text"][:][1:-1]
phones = [ reverse_symmap[f'{token}'] for token in tokens ]
phones = list("".join(phones).replace(" ", " "))
j = {
"text": "",
"phonemes": phones,
"language": "en"
}
with open(text_path, "w", encoding="utf-8") as f:
f.write( json.dumps( j ) )
except Exception as e:
raise e
add( type="training" )
add( type="validation" )
add( type="noise", texts=False )
hf.close()
def retokenize_dataset_hdf5( skip_existing=True ):
cfg.dataset.use_hdf5 = True
cfg.load_hdf5(write=True)
hf = cfg.hdf5
symmap = get_phone_symmap()
reverse_symmap = {"1":"<s>","2":"</s>","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"","12":"","13":"d","14":"ɹ","15":"tˈ","16":"","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"","75":"kˈ","76":"ɔ","77":"zˈ","78":"","79":"","80":"vˈ","81":"","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"","86":"pˈ","87":"ðˌ","88":"","89":"","90":"","91":"̩","92":"ʔ","93":"","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"","101":"uːˈ","102":"iːˈ","103":"","104":".ˈ","105":"","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"","127":"ɫ","128":"q","129":"","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":"","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"","150":"","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"","180":"","181":"“ˈ","182":"“ˌ","183":";ˈ","184":"","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"","221":"eˈ","222":"ʍ","223":"","224":"","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
root = str(cfg.data_dir)
def add( type="training" ):
for group in tqdm( hf[f'{type}/data/'].keys(), desc=f"Processing {type}"):
for name in tqdm( hf[f'{type}/data/{group}'].keys(), desc=f"Processing {group}"):
(cfg.data_dir / group / name).mkdir(parents=True, exist_ok=True)
for id in tqdm( hf[f'{type}/data/{group}/{name}'].keys(), desc=f"Processing {name}"):
try:
key = f'{type}/data/{group}/{name}/{id}'
if key not in hf:
tqdm.write(f'Missing key: {key}')
continue
if "text" not in hf[key]:
tqdm.write(f'Missing text: {key}')
continue
# text
tokens = hf[key]["text"][:][1:-1]
content = list("".join([ reverse_symmap[f'{token}'] for token in tokens ]).replace(" ", " "))
tokens = cfg.tokenizer.encode("".join(content))
tokens = np.array(tokens).astype(np.uint8)
del hf[key]['text']
hf[key].create_dataset('text', data=tokens, compression='lzf')
except Exception as e:
raise e
add( type="training" )
add( type="validation" )
# write symmap
if "symmap" in hf:
del hf['symmap']
hf.create_dataset('symmap', data=json.dumps(symmap))
hf.close()
if __name__ == "__main__": if __name__ == "__main__":
import argparse import argparse
@@ -1135,10 +984,6 @@ if __name__ == "__main__":
if args.action == "hdf5": if args.action == "hdf5":
create_dataset_hdf5() create_dataset_hdf5()
if args.action == "extract-hdf5":
extract_dataset_hdf5()
if args.action == "retokenize-hdf5":
retokenize_dataset_hdf5()
elif args.action == "list-dataset": elif args.action == "list-dataset":
dataset = [] dataset = []
for group in os.listdir(cfg.data_dir): for group in os.listdir(cfg.data_dir):
@@ -1147,7 +992,7 @@ if __name__ == "__main__":
continue continue
dataset.append(f'{group}/{name}') dataset.append(f'{group}/{name}')
print(dataset) print(json.dumps(dataset))
elif args.action == "metadata": elif args.action == "metadata":
create_dataset_metadata() create_dataset_metadata()
elif args.action == "sample": elif args.action == "sample":