final tweaks, hopefully, again
This commit is contained in:
parent
8d79f78e0a
commit
d9aabfa3ae
|
@ -41,9 +41,7 @@
|
|||
}
|
||||
],
|
||||
"normalizer": null,
|
||||
"pre_tokenizer": {
|
||||
"type": "Whitespace"
|
||||
},
|
||||
"pre_tokenizer": null,
|
||||
"post_processor": {
|
||||
"type": "TemplateProcessing",
|
||||
"single": [
|
||||
|
@ -110,263 +108,264 @@
|
|||
"end_of_word_suffix": null,
|
||||
"fuse_unk": false,
|
||||
"byte_fallback": false,
|
||||
"ignore_merges": false,
|
||||
"vocab": {
|
||||
"<unk>": 0,
|
||||
"<bos>": 1,
|
||||
"</eos>": 2,
|
||||
"<mask>": 3,
|
||||
"!": 4,
|
||||
"\"": 5,
|
||||
"(": 6,
|
||||
")": 7,
|
||||
",": 8,
|
||||
"-": 9,
|
||||
".": 10,
|
||||
"1": 11,
|
||||
":": 12,
|
||||
"?": 13,
|
||||
"a": 14,
|
||||
"b": 15,
|
||||
"c": 16,
|
||||
"d": 17,
|
||||
"e": 18,
|
||||
"f": 19,
|
||||
"h": 20,
|
||||
"i": 21,
|
||||
"j": 22,
|
||||
"k": 23,
|
||||
"l": 24,
|
||||
"m": 25,
|
||||
"n": 26,
|
||||
"o": 27,
|
||||
"p": 28,
|
||||
"q": 29,
|
||||
"r": 30,
|
||||
"s": 31,
|
||||
"t": 32,
|
||||
"u": 33,
|
||||
"v": 34,
|
||||
"w": 35,
|
||||
"x": 36,
|
||||
"z": 37,
|
||||
"¡": 38,
|
||||
"«": 39,
|
||||
"»": 40,
|
||||
"¿": 41,
|
||||
"æ": 42,
|
||||
"ç": 43,
|
||||
"ð": 44,
|
||||
"ŋ": 45,
|
||||
"ɐ": 46,
|
||||
"ɑ": 47,
|
||||
"ɔ": 48,
|
||||
"ɕ": 49,
|
||||
"ə": 50,
|
||||
"ɚ": 51,
|
||||
"ɛ": 52,
|
||||
"ɜ": 53,
|
||||
"ɟ": 54,
|
||||
"ɡ": 55,
|
||||
"ɪ": 56,
|
||||
"ɬ": 57,
|
||||
"ɯ": 58,
|
||||
"ɹ": 59,
|
||||
"ɾ": 60,
|
||||
"ʃ": 61,
|
||||
"ʈ": 62,
|
||||
"ʊ": 63,
|
||||
"ʋ": 64,
|
||||
"ʌ": 65,
|
||||
"ʑ": 66,
|
||||
"ʒ": 67,
|
||||
"ʔ": 68,
|
||||
"ʲ": 69,
|
||||
"ˈ": 70,
|
||||
"ˌ": 71,
|
||||
"ː": 72,
|
||||
"̃": 73,
|
||||
"̩": 74,
|
||||
"θ": 75,
|
||||
"ᵻ": 76,
|
||||
"—": 77,
|
||||
"…": 78,
|
||||
"ˈɛ": 79,
|
||||
"iː": 80,
|
||||
"aɪ": 81,
|
||||
"nd": 82,
|
||||
"ˈɪ": 83,
|
||||
"eɪ": 84,
|
||||
"ˈæ": 85,
|
||||
"oʊ": 86,
|
||||
"ðə": 87,
|
||||
"ɑː": 88,
|
||||
"ˈeɪ": 89,
|
||||
"ən": 90,
|
||||
"uː": 91,
|
||||
"ˈʌ": 92,
|
||||
"ˈaɪ": 93,
|
||||
"st": 94,
|
||||
"ˈɔ": 95,
|
||||
"ˈoʊ": 96,
|
||||
"ˈiː": 97,
|
||||
"ˈɑː": 98,
|
||||
"ænd": 99,
|
||||
"ːɹ": 100,
|
||||
"ɪŋ": 101,
|
||||
"ɜː": 102,
|
||||
"ɪn": 103,
|
||||
"tə": 104,
|
||||
"ʌv": 105,
|
||||
"aʊ": 106,
|
||||
"əl": 107,
|
||||
"ˈuː": 108,
|
||||
"tʃ": 109,
|
||||
"ɪz": 110,
|
||||
"ˈɜː": 111,
|
||||
"ˌʌ": 112,
|
||||
"æt": 113,
|
||||
"dʒ": 114,
|
||||
"ˈɔː": 115,
|
||||
"ɪt": 116,
|
||||
"ˈaʊ": 117,
|
||||
"ɚɹ": 118,
|
||||
"ˈɛn": 119,
|
||||
"wʌ": 120,
|
||||
"li": 121,
|
||||
"hiː": 122,
|
||||
"ˌɛ": 123,
|
||||
"wɪ": 124,
|
||||
"ðæt": 125,
|
||||
"wʌz": 126,
|
||||
"juː": 127,
|
||||
"oːɹ": 128,
|
||||
"ðɪ": 129,
|
||||
"sˈɛ": 130,
|
||||
"ˈɑːɹ": 131,
|
||||
"ˌɪ": 132,
|
||||
"nt": 133,
|
||||
"ˈʊ": 134,
|
||||
"ənt": 135,
|
||||
"hɪz": 136,
|
||||
"hæ": 137,
|
||||
"ˌɑː": 138,
|
||||
"ɔːɹ": 139,
|
||||
"ˈɛɹ": 140,
|
||||
"wɪð": 141,
|
||||
"ᵻd": 142,
|
||||
"ˈoːɹ": 143,
|
||||
"pɹ": 144,
|
||||
"ˈɔːl": 145,
|
||||
"mˌ": 146,
|
||||
"ʃən": 147,
|
||||
"kt": 148,
|
||||
"ˌoʊ": 149,
|
||||
"ˈɔːɹ": 150,
|
||||
"fɹ": 151,
|
||||
"æz": 152,
|
||||
"ʃiː": 153,
|
||||
"ˌʌt": 154,
|
||||
"ˈɛl": 155,
|
||||
"ˌaʊ": 156,
|
||||
"ˈʌn": 157,
|
||||
"əs": 158,
|
||||
"hɜː": 159,
|
||||
"lˈaɪ": 160,
|
||||
"ˈæn": 161,
|
||||
"ˈɪɹ": 162,
|
||||
"ʊd": 163,
|
||||
"ɹᵻ": 164,
|
||||
"ld": 165,
|
||||
"bˌʌt": 166,
|
||||
"ks": 167,
|
||||
"nˈoʊ": 168,
|
||||
"ɾɚ": 169,
|
||||
"hæd": 170,
|
||||
"ɛɹ": 171,
|
||||
"ˈɪŋ": 172,
|
||||
"ɡɹ": 173,
|
||||
"ɔn": 174,
|
||||
"nˌɑː": 175,
|
||||
"maɪ": 176,
|
||||
"vɚ": 177,
|
||||
"fɔːɹ": 178,
|
||||
"ðɚ": 179,
|
||||
"tʊ": 180,
|
||||
"ðɛɹ": 181,
|
||||
"ˈʌm": 182,
|
||||
"nˌɑːt": 183,
|
||||
"tɹ": 184,
|
||||
"sˈiː": 185,
|
||||
"ʌvðə": 186,
|
||||
"mˈɪ": 187,
|
||||
"ˈæp": 188,
|
||||
"ˌɪm": 189,
|
||||
"ɪk": 190,
|
||||
"sp": 191,
|
||||
"lˈeɪ": 192,
|
||||
"hˌɪm": 193,
|
||||
"ɐn": 194,
|
||||
"ðeɪ": 195,
|
||||
"lˈɪ": 196,
|
||||
"ɾi": 197,
|
||||
"bɹ": 198,
|
||||
"lˈɛ": 199,
|
||||
"kɹ": 200,
|
||||
"ˈɪl": 201,
|
||||
"jˈuː": 202,
|
||||
"ʌm": 203,
|
||||
"mˌiː": 204,
|
||||
" ": 4,
|
||||
"!": 5,
|
||||
"\"": 6,
|
||||
"(": 7,
|
||||
")": 8,
|
||||
",": 9,
|
||||
"-": 10,
|
||||
".": 11,
|
||||
"1": 12,
|
||||
":": 13,
|
||||
";": 14,
|
||||
"?": 15,
|
||||
"a": 16,
|
||||
"b": 17,
|
||||
"c": 18,
|
||||
"d": 19,
|
||||
"e": 20,
|
||||
"f": 21,
|
||||
"h": 22,
|
||||
"i": 23,
|
||||
"j": 24,
|
||||
"k": 25,
|
||||
"l": 26,
|
||||
"m": 27,
|
||||
"n": 28,
|
||||
"o": 29,
|
||||
"p": 30,
|
||||
"q": 31,
|
||||
"r": 32,
|
||||
"s": 33,
|
||||
"t": 34,
|
||||
"u": 35,
|
||||
"v": 36,
|
||||
"w": 37,
|
||||
"x": 38,
|
||||
"z": 39,
|
||||
"¡": 40,
|
||||
"«": 41,
|
||||
"»": 42,
|
||||
"¿": 43,
|
||||
"æ": 44,
|
||||
"ç": 45,
|
||||
"ð": 46,
|
||||
"ŋ": 47,
|
||||
"ɐ": 48,
|
||||
"ɑ": 49,
|
||||
"ɔ": 50,
|
||||
"ɕ": 51,
|
||||
"ə": 52,
|
||||
"ɚ": 53,
|
||||
"ɛ": 54,
|
||||
"ɜ": 55,
|
||||
"ɟ": 56,
|
||||
"ɡ": 57,
|
||||
"ɪ": 58,
|
||||
"ɬ": 59,
|
||||
"ɯ": 60,
|
||||
"ɹ": 61,
|
||||
"ɾ": 62,
|
||||
"ʃ": 63,
|
||||
"ʈ": 64,
|
||||
"ʊ": 65,
|
||||
"ʋ": 66,
|
||||
"ʌ": 67,
|
||||
"ʑ": 68,
|
||||
"ʒ": 69,
|
||||
"ʔ": 70,
|
||||
"ʲ": 71,
|
||||
"ˈ": 72,
|
||||
"ˌ": 73,
|
||||
"ː": 74,
|
||||
"̃": 75,
|
||||
"̩": 76,
|
||||
"θ": 77,
|
||||
"ᵻ": 78,
|
||||
"—": 79,
|
||||
"“": 80,
|
||||
"”": 81,
|
||||
"…": 82,
|
||||
"ˈɛ": 83,
|
||||
"iː": 84,
|
||||
"aɪ": 85,
|
||||
"nd": 86,
|
||||
"ˈɪ": 87,
|
||||
"eɪ": 88,
|
||||
"ˈæ": 89,
|
||||
"ðə": 90,
|
||||
"oʊ": 91,
|
||||
"ɑː": 92,
|
||||
"ˈeɪ": 93,
|
||||
"ən": 94,
|
||||
"uː": 95,
|
||||
"ˈʌ": 96,
|
||||
"ˈaɪ": 97,
|
||||
"st": 98,
|
||||
"ˈɔ": 99,
|
||||
"ˈoʊ": 100,
|
||||
"ˈiː": 101,
|
||||
"ˈɑː": 102,
|
||||
"ænd": 103,
|
||||
"ːɹ": 104,
|
||||
"ɪŋ": 105,
|
||||
"ɜː": 106,
|
||||
"ɪn": 107,
|
||||
"tə": 108,
|
||||
"ʌv": 109,
|
||||
"aʊ": 110,
|
||||
"əl": 111,
|
||||
"ˈuː": 112,
|
||||
"tʃ": 113,
|
||||
"ɪz": 114,
|
||||
"ˈɜː": 115,
|
||||
"ˌʌ": 116,
|
||||
"æt": 117,
|
||||
"dʒ": 118,
|
||||
"ˈɔː": 119,
|
||||
"ɪt": 120,
|
||||
"ˈaʊ": 121,
|
||||
"ɚɹ": 122,
|
||||
"ˈɛn": 123,
|
||||
"wʌ": 124,
|
||||
"li": 125,
|
||||
"hiː": 126,
|
||||
"ˌɛ": 127,
|
||||
"wɪ": 128,
|
||||
"wʌz": 129,
|
||||
"ðæt": 130,
|
||||
"juː": 131,
|
||||
"oːɹ": 132,
|
||||
"ðɪ": 133,
|
||||
"sˈɛ": 134,
|
||||
"ˌɪ": 135,
|
||||
"ˈɑːɹ": 136,
|
||||
"nt": 137,
|
||||
"ˈʊ": 138,
|
||||
"ənt": 139,
|
||||
"hɪz": 140,
|
||||
"ˌɑː": 141,
|
||||
"hæ": 142,
|
||||
"ɔːɹ": 143,
|
||||
"ˈɛɹ": 144,
|
||||
"wɪð": 145,
|
||||
"ᵻd": 146,
|
||||
"ˈoːɹ": 147,
|
||||
"pɹ": 148,
|
||||
"ˈɔːl": 149,
|
||||
"mˌ": 150,
|
||||
"ʃən": 151,
|
||||
"kt": 152,
|
||||
"ˌoʊ": 153,
|
||||
"ˈɔːɹ": 154,
|
||||
"fɹ": 155,
|
||||
"æz": 156,
|
||||
"ˌʌt": 157,
|
||||
"ʃiː": 158,
|
||||
"ˈɛl": 159,
|
||||
"ˌaʊ": 160,
|
||||
"ˈʌn": 161,
|
||||
"əs": 162,
|
||||
"hɜː": 163,
|
||||
"lˈaɪ": 164,
|
||||
"ˈæn": 165,
|
||||
"ˈɪɹ": 166,
|
||||
"ʊd": 167,
|
||||
"ɹᵻ": 168,
|
||||
"ld": 169,
|
||||
"bˌʌt": 170,
|
||||
"ks": 171,
|
||||
"nˈoʊ": 172,
|
||||
"hæd": 173,
|
||||
"ɾɚ": 174,
|
||||
"ɛɹ": 175,
|
||||
"ˈɪŋ": 176,
|
||||
"ɡɹ": 177,
|
||||
"nˌɑː": 178,
|
||||
"ɔn": 179,
|
||||
"vɚ": 180,
|
||||
"maɪ": 181,
|
||||
"fɔːɹ": 182,
|
||||
"ðɚ": 183,
|
||||
"tʊ": 184,
|
||||
"ðɛɹ": 185,
|
||||
"nˌɑːt": 186,
|
||||
"ˈʌm": 187,
|
||||
"tɹ": 188,
|
||||
"sˈiː": 189,
|
||||
"ʌvðə": 190,
|
||||
"mˈɪ": 191,
|
||||
"hˈæ": 192,
|
||||
"ˌɪm": 193,
|
||||
"lˈeɪ": 194,
|
||||
"ɪk": 195,
|
||||
"sp": 196,
|
||||
"hˌɪm": 197,
|
||||
"ɐn": 198,
|
||||
"ðeɪ": 199,
|
||||
"lˈɪ": 200,
|
||||
"ɾi": 201,
|
||||
"lˈɛ": 202,
|
||||
"bɹ": 203,
|
||||
"kɹ": 204,
|
||||
"lˈæ": 205,
|
||||
"ˌɪn": 206,
|
||||
"bᵻ": 207,
|
||||
"wˈʌn": 208,
|
||||
"ˈɪn": 209,
|
||||
"ˈoʊn": 210,
|
||||
"biː": 211,
|
||||
"sˈɛd": 212,
|
||||
"ˈɛd": 213,
|
||||
"ˈaɪt": 214,
|
||||
"fɹʌm": 215,
|
||||
"baɪ": 216,
|
||||
"ɪs": 217,
|
||||
"ɚz": 218,
|
||||
"ðɪs": 219,
|
||||
"əns": 220,
|
||||
"ɪf": 221,
|
||||
"bəl": 222,
|
||||
"ˈænd": 223,
|
||||
"ɪnðə": 224,
|
||||
"əm": 225,
|
||||
"iːz": 226,
|
||||
"ˌuː": 227,
|
||||
"ᵻz": 228,
|
||||
"wˈeɪ": 229,
|
||||
"ft": 230,
|
||||
"wiː": 231,
|
||||
"lˈiː": 232,
|
||||
"stɹ": 233,
|
||||
"jʊ": 234,
|
||||
"ɚd": 235,
|
||||
"ˌaɪ": 236,
|
||||
"kw": 237,
|
||||
"ˌɔn": 238,
|
||||
"ˈaɪd": 239,
|
||||
"ts": 240,
|
||||
"ɪm": 241,
|
||||
"ˈʌst": 242,
|
||||
"ˈoʊld": 243,
|
||||
"ˌɪtʃ": 244,
|
||||
"dˈɪ": 245,
|
||||
"sˌoʊ": 246,
|
||||
"ɑːɹ": 247,
|
||||
"hɐ": 248,
|
||||
"sˈeɪ": 249,
|
||||
"ɾᵻd": 250,
|
||||
"dᵻ": 251,
|
||||
"wˌɪtʃ": 252,
|
||||
"sˈɛl": 253,
|
||||
"ɹi": 254,
|
||||
"ˈʌðɚ": 255
|
||||
"ˈɪl": 206,
|
||||
"jˈuː": 207,
|
||||
"ʌm": 208,
|
||||
"mˌiː": 209,
|
||||
"bᵻ": 210,
|
||||
"wˈʌn": 211,
|
||||
"ˌɪn": 212,
|
||||
"ˈɪn": 213,
|
||||
"ˈoʊn": 214,
|
||||
"sˈɛd": 215,
|
||||
"biː": 216,
|
||||
"ˈɛd": 217,
|
||||
"ˈaɪt": 218,
|
||||
"baɪ": 219,
|
||||
"fɹʌm": 220,
|
||||
"ɪs": 221,
|
||||
"ɚz": 222,
|
||||
"ðɪs": 223,
|
||||
"əns": 224,
|
||||
"bəl": 225,
|
||||
"ɪf": 226,
|
||||
"ɪnðə": 227,
|
||||
"əm": 228,
|
||||
"ᵻz": 229,
|
||||
"ˌuː": 230,
|
||||
"wˈeɪ": 231,
|
||||
"ft": 232,
|
||||
"wiː": 233,
|
||||
"stɹ": 234,
|
||||
"lˈiː": 235,
|
||||
"iːz": 236,
|
||||
"pt": 237,
|
||||
"jʊ": 238,
|
||||
"ɚd": 239,
|
||||
"ˌaɪ": 240,
|
||||
"kw": 241,
|
||||
"ˌɔn": 242,
|
||||
"ˈaɪd": 243,
|
||||
"ɪm": 244,
|
||||
"ˈʌst": 245,
|
||||
"ˈoʊld": 246,
|
||||
"ts": 247,
|
||||
"ˌɪtʃ": 248,
|
||||
"sˌoʊ": 249,
|
||||
"dˈɪ": 250,
|
||||
"ɑːɹ": 251,
|
||||
"hɐ": 252,
|
||||
"sˈeɪ": 253,
|
||||
"ɾᵻd": 254,
|
||||
"wˌɪtʃ": 255
|
||||
},
|
||||
"merges": [
|
||||
"ˈ ɛ",
|
||||
|
@ -376,8 +375,8 @@
|
|||
"ˈ ɪ",
|
||||
"e ɪ",
|
||||
"ˈ æ",
|
||||
"o ʊ",
|
||||
"ð ə",
|
||||
"o ʊ",
|
||||
"ɑ ː",
|
||||
"ˈ eɪ",
|
||||
"ə n",
|
||||
|
@ -415,20 +414,20 @@
|
|||
"h iː",
|
||||
"ˌ ɛ",
|
||||
"w ɪ",
|
||||
"ð æt",
|
||||
"wʌ z",
|
||||
"ð æt",
|
||||
"j uː",
|
||||
"o ːɹ",
|
||||
"ð ɪ",
|
||||
"s ˈɛ",
|
||||
"ˈɑː ɹ",
|
||||
"ˌ ɪ",
|
||||
"ˈɑː ɹ",
|
||||
"n t",
|
||||
"ˈ ʊ",
|
||||
"ən t",
|
||||
"h ɪz",
|
||||
"h æ",
|
||||
"ˌ ɑː",
|
||||
"h æ",
|
||||
"ɔ ːɹ",
|
||||
"ˈɛ ɹ",
|
||||
"wɪ ð",
|
||||
|
@ -443,8 +442,8 @@
|
|||
"ˈɔ ːɹ",
|
||||
"f ɹ",
|
||||
"æ z",
|
||||
"ʃ iː",
|
||||
"ˌʌ t",
|
||||
"ʃ iː",
|
||||
"ˈɛ l",
|
||||
"ˌ aʊ",
|
||||
"ˈʌ n",
|
||||
|
@ -459,93 +458,89 @@
|
|||
"b ˌʌt",
|
||||
"k s",
|
||||
"n ˈoʊ",
|
||||
"ɾ ɚ",
|
||||
"hæ d",
|
||||
"ɾ ɚ",
|
||||
"ɛ ɹ",
|
||||
"ˈɪ ŋ",
|
||||
"ɡ ɹ",
|
||||
"ɔ n",
|
||||
"n ˌɑː",
|
||||
"m aɪ",
|
||||
"ɔ n",
|
||||
"v ɚ",
|
||||
"m aɪ",
|
||||
"f ɔːɹ",
|
||||
"ð ɚ",
|
||||
"t ʊ",
|
||||
"ð ɛɹ",
|
||||
"ˈʌ m",
|
||||
"nˌɑː t",
|
||||
"ˈʌ m",
|
||||
"t ɹ",
|
||||
"s ˈiː",
|
||||
"ʌv ðə",
|
||||
"m ˈɪ",
|
||||
"ˈæ p",
|
||||
"h ˈæ",
|
||||
"ˌɪ m",
|
||||
"l ˈeɪ",
|
||||
"ɪ k",
|
||||
"s p",
|
||||
"l ˈeɪ",
|
||||
"h ˌɪm",
|
||||
"ɐ n",
|
||||
"ð eɪ",
|
||||
"l ˈɪ",
|
||||
"ɾ i",
|
||||
"b ɹ",
|
||||
"l ˈɛ",
|
||||
"b ɹ",
|
||||
"k ɹ",
|
||||
"l ˈæ",
|
||||
"ˈɪ l",
|
||||
"j ˈuː",
|
||||
"ʌ m",
|
||||
"mˌ iː",
|
||||
"l ˈæ",
|
||||
"ˌ ɪn",
|
||||
"b ᵻ",
|
||||
"w ˈʌn",
|
||||
"ˌ ɪn",
|
||||
"ˈɪ n",
|
||||
"ˈoʊ n",
|
||||
"b iː",
|
||||
"sˈɛ d",
|
||||
"b iː",
|
||||
"ˈɛ d",
|
||||
"ˈaɪ t",
|
||||
"fɹ ʌm",
|
||||
"b aɪ",
|
||||
"fɹ ʌm",
|
||||
"ɪ s",
|
||||
"ɚ z",
|
||||
"ðɪ s",
|
||||
"ən s",
|
||||
"ɪ f",
|
||||
"b əl",
|
||||
"ˈæ nd",
|
||||
"ɪ f",
|
||||
"ɪn ðə",
|
||||
"ə m",
|
||||
"iː z",
|
||||
"ˌ uː",
|
||||
"ᵻ z",
|
||||
"ˌ uː",
|
||||
"w ˈeɪ",
|
||||
"f t",
|
||||
"w iː",
|
||||
"l ˈiː",
|
||||
"st ɹ",
|
||||
"l ˈiː",
|
||||
"iː z",
|
||||
"p t",
|
||||
"j ʊ",
|
||||
"ɚ d",
|
||||
"ˌ aɪ",
|
||||
"k w",
|
||||
"ˌ ɔn",
|
||||
"ˈaɪ d",
|
||||
"t s",
|
||||
"ɪ m",
|
||||
"ˈʌ st",
|
||||
"ˈoʊ ld",
|
||||
"t s",
|
||||
"ˌɪ tʃ",
|
||||
"d ˈɪ",
|
||||
"s ˌoʊ",
|
||||
"d ˈɪ",
|
||||
"ɑː ɹ",
|
||||
"h ɐ",
|
||||
"s ˈeɪ",
|
||||
"ɾ ᵻd",
|
||||
"d ᵻ",
|
||||
"w ˌɪtʃ",
|
||||
"sˈɛ l",
|
||||
"ɹ i",
|
||||
"ˈʌ ðɚ"
|
||||
"w ˌɪtʃ"
|
||||
]
|
||||
}
|
||||
}
|
|
@ -6,16 +6,22 @@ import torchaudio
|
|||
from tqdm.auto import tqdm
|
||||
from pathlib import Path
|
||||
from vall_e.config import cfg
|
||||
|
||||
# things that could be args
|
||||
cfg.sample_rate = 44_000
|
||||
cfg.inference.audio_backend = "dac"
|
||||
"""
|
||||
cfg.inference.weight_dtype = "bfloat16"
|
||||
cfg.inference.dtype = torch.bfloat16
|
||||
cfg.inference.amp = True
|
||||
"""
|
||||
|
||||
from vall_e.emb.g2p import encode as valle_phonemize
|
||||
from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension
|
||||
|
||||
# things that could be args
|
||||
cfg.sample_rate = 24_000
|
||||
cfg.inference.audio_backend = "encodec"
|
||||
|
||||
input_audio = "voices"
|
||||
input_metadata = "./training/metadata"
|
||||
output_dataset = f"./training/data-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}"
|
||||
input_metadata = "metadata"
|
||||
output_dataset = f"training-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}"
|
||||
device = "cuda"
|
||||
|
||||
audio_extension = ".dac" if cfg.inference.audio_backend == "dac" else ".enc"
|
||||
|
@ -34,9 +40,6 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
|
|||
if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
|
||||
print("Is not dir:", f'./{input_audio}/{dataset_name}/')
|
||||
continue
|
||||
|
||||
if dataset_name in ["LibriVox", "Audiobooks"]:
|
||||
continue
|
||||
|
||||
for speaker_id in tqdm(sorted(os.listdir(f'./{input_audio}/{dataset_name}/')), desc=f"Processing speaker in {dataset_name}"):
|
||||
if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
|
||||
|
@ -55,10 +58,29 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
|
|||
|
||||
waveform, sample_rate = torchaudio.load(inpath)
|
||||
qnt = valle_quantize(waveform, sr=sample_rate, device=device)
|
||||
|
||||
if cfg.inference.audio_backend == "dac":
|
||||
qnt.save(_replace_file_extension(outpath, audio_extension))
|
||||
np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
|
||||
"codes": qnt.codes.numpy().astype(np.uint16),
|
||||
"metadata": {
|
||||
"original_length": qnt.original_length,
|
||||
"sample_rate": qnt.sample_rate,
|
||||
|
||||
"input_db": qnt.input_db.numpy().astype(np.float32),
|
||||
"chunk_length": qnt.chunk_length,
|
||||
"channels": qnt.channels,
|
||||
"padding": qnt.padding,
|
||||
"dac_version": "1.0.0",
|
||||
},
|
||||
})
|
||||
else:
|
||||
torch.save( qnt, _replace_file_extension(outpath, audio_extension) )
|
||||
np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
|
||||
"codes": qnt.numpy().astype(np.uint16),
|
||||
"metadata": {
|
||||
"original_length": waveform.shape[-1],
|
||||
"sample_rate": sample_rate,
|
||||
},
|
||||
})
|
||||
|
||||
continue
|
||||
|
||||
|
@ -91,7 +113,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
|
|||
fname = filename.replace(f'.{extension}', "")
|
||||
|
||||
waveform, sample_rate = None, None
|
||||
language = metadata[filename]["language"] if "language" in metadata[filename] else "english"
|
||||
language = metadata[filename]["language"] if "language" in metadata[filename] else "en"
|
||||
|
||||
if len(metadata[filename]["segments"]) == 0 or not use_slices:
|
||||
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}.{extension}')
|
||||
|
@ -100,86 +122,101 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
|
|||
if len(text) == 0:
|
||||
continue
|
||||
|
||||
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists():
|
||||
if _replace_file_extension(outpath, audio_extension).exists():
|
||||
continue
|
||||
|
||||
if not _replace_file_extension(outpath, ".json").exists():
|
||||
txts.append((
|
||||
outpath,
|
||||
text,
|
||||
language,
|
||||
))
|
||||
|
||||
if not _replace_file_extension(outpath, audio_extension).exists():
|
||||
if waveform is None:
|
||||
waveform, sample_rate = torchaudio.load(inpath)
|
||||
if waveform is None:
|
||||
waveform, sample_rate = torchaudio.load(inpath)
|
||||
if waveform.shape[0] > 1:
|
||||
waveform = torch.mean(waveform, dim=0, keepdim=True)
|
||||
|
||||
wavs.append((
|
||||
outpath,
|
||||
waveform,
|
||||
sample_rate
|
||||
))
|
||||
wavs.append((
|
||||
outpath,
|
||||
text,
|
||||
language,
|
||||
waveform,
|
||||
sample_rate
|
||||
))
|
||||
else:
|
||||
i = 0
|
||||
for segment in metadata[filename]["segments"]:
|
||||
id = pad(i, 4)
|
||||
i = i + 1
|
||||
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
|
||||
|
||||
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists():
|
||||
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
|
||||
text = metadata[filename]["text"]
|
||||
|
||||
if len(text) == 0:
|
||||
continue
|
||||
|
||||
if not _replace_file_extension(outpath, ".json").exists():
|
||||
txts.append((
|
||||
outpath,
|
||||
segment["text"],
|
||||
language,
|
||||
))
|
||||
|
||||
if not _replace_file_extension(outpath, audio_extension).exists():
|
||||
if waveform is None:
|
||||
waveform, sample_rate = torchaudio.load(inpath)
|
||||
if _replace_file_extension(outpath, audio_extension).exists():
|
||||
continue
|
||||
|
||||
start = int(segment['start'] * sample_rate)
|
||||
end = int(segment['end'] * sample_rate)
|
||||
if waveform is None:
|
||||
waveform, sample_rate = torchaudio.load(inpath)
|
||||
if waveform.shape[0] > 1:
|
||||
waveform = torch.mean(waveform, dim=0, keepdim=True)
|
||||
|
||||
if start < 0:
|
||||
start = 0
|
||||
if end >= waveform.shape[-1]:
|
||||
end = waveform.shape[-1] - 1
|
||||
start = int(segment['start'] * sample_rate)
|
||||
end = int(segment['end'] * sample_rate)
|
||||
|
||||
if end - start < 0:
|
||||
continue
|
||||
if start < 0:
|
||||
start = 0
|
||||
if end >= waveform.shape[-1]:
|
||||
end = waveform.shape[-1] - 1
|
||||
|
||||
wavs.append((
|
||||
outpath,
|
||||
waveform[:, start:end],
|
||||
sample_rate
|
||||
))
|
||||
if end - start < 0:
|
||||
continue
|
||||
|
||||
if len(txts) > 0:
|
||||
for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}", disable=True):
|
||||
outpath, text, language = job
|
||||
phones = valle_phonemize(text)
|
||||
data = {
|
||||
"text": text.strip(),
|
||||
"phonemes": phones,
|
||||
"language": language,
|
||||
}
|
||||
open(_replace_file_extension(outpath, ".json"), 'w', encoding='utf-8').write(json.dumps(data))
|
||||
wavs.append((
|
||||
outpath,
|
||||
text,
|
||||
language,
|
||||
waveform[:, start:end],
|
||||
sample_rate
|
||||
))
|
||||
|
||||
if len(wavs) > 0:
|
||||
for job in tqdm(wavs, desc=f"Quantizing: {speaker_id}"):
|
||||
try:
|
||||
outpath, waveform, sample_rate = job
|
||||
outpath, text, language, waveform, sample_rate = job
|
||||
|
||||
phones = valle_phonemize(text)
|
||||
qnt = valle_quantize(waveform, sr=sample_rate, device=device)
|
||||
|
||||
if cfg.inference.audio_backend == "dac":
|
||||
qnt.save(_replace_file_extension(outpath, audio_extension))
|
||||
np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
|
||||
"codes": qnt.codes.numpy().astype(np.uint16),
|
||||
"metadata": {
|
||||
"original_length": qnt.original_length,
|
||||
"sample_rate": qnt.sample_rate,
|
||||
|
||||
"input_db": qnt.input_db.numpy().astype(np.float32),
|
||||
"chunk_length": qnt.chunk_length,
|
||||
"channels": qnt.channels,
|
||||
"padding": qnt.padding,
|
||||
"dac_version": "1.0.0",
|
||||
|
||||
"text": text.strip(),
|
||||
"phonemes": "".join(phones),
|
||||
"language": language,
|
||||
},
|
||||
})
|
||||
else:
|
||||
torch.save( qnt, _replace_file_extension(outpath, audio_extension) )
|
||||
np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
|
||||
"codes": qnt.numpy().astype(np.uint16),
|
||||
"metadata": {
|
||||
"original_length": waveform.shape[-1],
|
||||
"sample_rate": sample_rate,
|
||||
|
||||
"text": text.strip(),
|
||||
"phonemes": "".join(phones),
|
||||
"language": language,
|
||||
},
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"Failed to quantize: {outpath}:", e)
|
||||
continue
|
||||
|
||||
open("./missing.json", 'w', encoding='utf-8').write(json.dumps(missing))
|
||||
open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))
|
||||
open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))
|
|
@ -38,20 +38,23 @@ else:
|
|||
metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/{id}')
|
||||
metadata = json.loads(open(metadata_path, "r", encoding="utf-8").read())
|
||||
|
||||
if "phonemes" not in metadata:
|
||||
continue
|
||||
|
||||
tokenizer_data.append( f'{"".join(metadata["phonemes"])}' )
|
||||
|
||||
open(output_file, 'w', encoding='utf-8').write(json.dumps(tokenizer_data))
|
||||
|
||||
unk_token = "<unk>"
|
||||
spl_tokens = ["<bos>", "</eos>", unk_token, "<mask>"]
|
||||
spl_tokens = [unk_token, "<bos>", "</eos>", "<mask>", "<space>"]
|
||||
|
||||
trainer = BpeTrainer(special_tokens = spl_tokens, vocab_size = 256)
|
||||
tokenizer = Tokenizer(BPE(unk_token = unk_token))
|
||||
tokenizer.pre_tokenizer = Whitespace()
|
||||
tokenizer.pre_tokenizer = Whitespace() # takes 2 hours to process without this, we'll just manually add spaces as a token
|
||||
tokenizer.post_processor = TemplateProcessing(
|
||||
single="<bos> $A <eos>",
|
||||
special_tokens=[("<bos>", 1), ("<eos>", 2)],
|
||||
)
|
||||
|
||||
tokenizer.train_from_iterator(tokenizer_data, trainer=trainer)
|
||||
tokenizer.save("./training/tokenizer.json")
|
||||
tokenizer.save("./training/tokenizer_training_data.json")
|
281
vall_e/data.py
281
vall_e/data.py
|
@ -86,19 +86,15 @@ def _calculate_durations( type="training" ):
|
|||
def _load_paths(dataset, type="training"):
|
||||
return { cfg.get_spkr( cfg.data_dir / data_dir / "dummy" ): _load_paths_from_metadata( data_dir, type=type, validate=cfg.dataset.validate and type == "training" ) for data_dir in tqdm(dataset, desc=f"Parsing dataset: {type}") }
|
||||
|
||||
def _load_paths_from_metadata(dataset_name, type="training", validate=False):
|
||||
data_dir = dataset_name if cfg.dataset.use_hdf5 else cfg.data_dir / dataset_name
|
||||
def _load_paths_from_metadata(group_name, type="training", validate=False):
|
||||
data_dir = group_name if cfg.dataset.use_hdf5 else cfg.data_dir / group_name
|
||||
|
||||
_fn = _get_hdf5_paths if cfg.dataset.use_hdf5 else _get_paths_of_extensions
|
||||
|
||||
def key( id ):
|
||||
if not cfg.dataset.use_hdf5:
|
||||
return data_dir / id
|
||||
def key( id, entry=None ):
|
||||
return f"/{type}/{_get_hdf5_path(data_dir)}/{id}" if cfg.dataset.use_hdf5 else data_dir / id
|
||||
|
||||
return f"/{type}/{_get_hdf5_path(data_dir)}/{id}"
|
||||
|
||||
|
||||
metadata_path = cfg.metadata_dir / f'{dataset_name}.json'
|
||||
metadata_path = cfg.metadata_dir / f'{group_name}.json'
|
||||
metadata = {}
|
||||
|
||||
if cfg.dataset.use_metadata and metadata_path.exists():
|
||||
|
@ -107,10 +103,7 @@ def _load_paths_from_metadata(dataset_name, type="training", validate=False):
|
|||
if len(metadata) == 0:
|
||||
return _fn( data_dir, type if cfg.dataset.use_hdf5 else _get_quant_extension(), validate )
|
||||
|
||||
|
||||
def _validate( id ):
|
||||
entry = metadata[id]
|
||||
|
||||
def _validate( id, entry ):
|
||||
phones = entry['phones'] if "phones" in entry else 0
|
||||
duration = entry['duration'] if "duration" in entry else 0
|
||||
if type not in _total_durations:
|
||||
|
@ -118,14 +111,16 @@ def _load_paths_from_metadata(dataset_name, type="training", validate=False):
|
|||
|
||||
_total_durations[type] += duration
|
||||
|
||||
"""
|
||||
if cfg.dataset.use_hdf5:
|
||||
k = key( id )
|
||||
if k not in cfg.hdf5 or "audio" not in cfg.hdf5[k] or "text" not in cfg.hdf5[k]:
|
||||
return False
|
||||
"""
|
||||
|
||||
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
|
||||
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration #and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
|
||||
|
||||
return [ key(id) for id in metadata.keys() if not validate or _validate(id) ]
|
||||
return [ key(id, entry) for id, entry in metadata.items() if not validate or _validate(id, entry) ]
|
||||
|
||||
|
||||
def _get_hdf5_path(path):
|
||||
|
@ -136,16 +131,16 @@ def _get_hdf5_path(path):
|
|||
def _get_hdf5_paths( data_dir, type="training", validate=False ):
|
||||
data_dir = str(data_dir)
|
||||
|
||||
def _validate( child ):
|
||||
phones = child.attrs['phonemes']
|
||||
duration = child.attrs['duration']
|
||||
def _validate( id, entry ):
|
||||
phones = entry.attrs['phonemes']
|
||||
duration = entry.attrs['duration']
|
||||
if type not in _total_durations:
|
||||
_total_durations[type] = 0
|
||||
_total_durations[type] += child.attrs['duration']
|
||||
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
|
||||
_total_durations[type] += entry.attrs['duration']
|
||||
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration #and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
|
||||
|
||||
key = f"/{type}/{_get_hdf5_path(data_dir)}"
|
||||
return [ Path(f"{key}/{child.attrs['id']}") for child in cfg.hdf5[key].values() if not validate or _validate(child) ] if key in cfg.hdf5 else []
|
||||
return [ Path(f"{key}/{id}") for id, entry in cfg.hdf5[key].items() if not validate or _validate(id, entry) ] if key in cfg.hdf5 else []
|
||||
|
||||
def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ):
|
||||
if isinstance(path, str):
|
||||
|
@ -807,47 +802,30 @@ def create_dataset_metadata( skip_existing=True ):
|
|||
if id not in metadata:
|
||||
metadata[id] = {}
|
||||
|
||||
# audio
|
||||
utterance_metadata = {}
|
||||
if audios:
|
||||
if _get_quant_extension() == ".dac":
|
||||
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
|
||||
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
|
||||
# ideally we'll encode Encodec-based audio in a similar manner because np has smaller files than pt
|
||||
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
|
||||
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
|
||||
|
||||
duration = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
|
||||
metadata[id]["metadata"] = {
|
||||
"original_length": dac["metadata"]["original_length"],
|
||||
"sample_rate": dac["metadata"]["sample_rate"],
|
||||
}
|
||||
else:
|
||||
qnt = torch.load(f'{root}/{name}/{id}{_get_quant_extension()}')[0].t()
|
||||
duration = qnt.shape[0] / cfg.dataset.frames_per_second
|
||||
|
||||
metadata[id]["duration"] = duration
|
||||
else:
|
||||
metadata[id]["duration"] = 0
|
||||
|
||||
if "text" in dac["metadata"]:
|
||||
utterance_metadata["text"] = dac["metadata"]["text"]
|
||||
if "phonemes" in dac["metadata"]:
|
||||
utterance_metadata["phonemes"] = dac["metadata"]["phonemes"]
|
||||
if "language" in dac["metadata"]:
|
||||
utterance_metadata["language"] = dac["metadata"]["language"]
|
||||
if "original_length" in dac["metadata"] and "sample_rate" in dac["metadata"]:
|
||||
utterance_metadata["duration"] = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
|
||||
# text
|
||||
if texts:
|
||||
if _get_phone_extension() == ".json":
|
||||
json_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
|
||||
content = json_metadata["phonemes"]
|
||||
txt = json_metadata["text"]
|
||||
lang = json_metadata["language"][:2]
|
||||
else:
|
||||
content = open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read().split(" ")
|
||||
txt = ""
|
||||
lang = "en"
|
||||
if not utterance_metadata:
|
||||
utterance_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
|
||||
|
||||
phn = cfg.tokenizer.encode("".join(content))
|
||||
phn = np.array(phn).astype(np.uint8)
|
||||
for k, v in utterance_metadata.items():
|
||||
metadata[id][k] = v
|
||||
|
||||
metadata[id]["phones"] = len(phn)
|
||||
metadata[id]["transcription"] = txt
|
||||
metadata[id]["language"] = lang
|
||||
except Exception as e:
|
||||
#raise e
|
||||
print(id, e)
|
||||
#pass
|
||||
tqdm.write(f'Error while processing {id}: {e}')
|
||||
|
||||
with open(str(metadata_path), "w", encoding="utf-8") as f:
|
||||
f.write( json.dumps( metadata ) )
|
||||
|
@ -900,84 +878,68 @@ def create_dataset_hdf5( skip_existing=True ):
|
|||
|
||||
for id in tqdm(ids, desc=f"Processing {name}"):
|
||||
try:
|
||||
audio_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}') if audios else True
|
||||
text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if texts else True
|
||||
audio_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}')
|
||||
text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if type != "Noise" else True
|
||||
|
||||
if not audio_exists or not text_exists:
|
||||
if not audio_exists:
|
||||
continue
|
||||
|
||||
key = f'{type}/{speaker_name}/{id}'
|
||||
|
||||
"""
|
||||
if skip_existing and key in hf:
|
||||
continue
|
||||
"""
|
||||
|
||||
group = hf.create_group(key) if key not in hf else hf[key]
|
||||
|
||||
"""
|
||||
group.attrs['id'] = id
|
||||
group.attrs['type'] = type
|
||||
group.attrs['speaker'] = speaker_name
|
||||
"""
|
||||
|
||||
if id not in metadata:
|
||||
metadata[id] = {}
|
||||
|
||||
utterance_metadata = {}
|
||||
|
||||
# audio
|
||||
if audios:
|
||||
if _get_quant_extension() == ".dac":
|
||||
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
|
||||
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
|
||||
# ideally we'll encode Encodec-based audio in a similar manner because np has smaller files than pt
|
||||
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
|
||||
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
|
||||
|
||||
duration = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
|
||||
metadata[id]["metadata"] = {
|
||||
"original_length": dac["metadata"]["original_length"],
|
||||
"sample_rate": dac["metadata"]["sample_rate"],
|
||||
}
|
||||
else:
|
||||
qnt = torch.load(f'{root}/{name}/{id}{_get_quant_extension()}')[0].t()
|
||||
duration = qnt.shape[0] / cfg.dataset.frames_per_second
|
||||
|
||||
qnt = qnt.numpy().astype(np.int16)
|
||||
if "text" in dac["metadata"]:
|
||||
utterance_metadata["text"] = dac["metadata"]["text"]
|
||||
if "phonemes" in dac["metadata"]:
|
||||
utterance_metadata["phonemes"] = dac["metadata"]["phonemes"]
|
||||
if "language" in dac["metadata"]:
|
||||
utterance_metadata["language"] = dac["metadata"]["language"]
|
||||
if "original_length" in dac["metadata"] and "sample_rate" in dac["metadata"]:
|
||||
utterance_metadata["duration"] = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
|
||||
|
||||
if "audio" not in group:
|
||||
group.create_dataset('audio', data=qnt, compression='lzf')
|
||||
group.create_dataset('audio', data=qnt.numpy().astype(np.int16), compression='lzf')
|
||||
|
||||
group.attrs['duration'] = duration
|
||||
metadata[id]["duration"] = duration
|
||||
else:
|
||||
group.attrs['duration'] = 0
|
||||
metadata[id]["duration"] = 0
|
||||
|
||||
# text
|
||||
if texts:
|
||||
if _get_phone_extension() == ".json":
|
||||
json_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
|
||||
content = json_metadata["phonemes"]
|
||||
txt = json_metadata["text"]
|
||||
lang = json_metadata["language"][:2]
|
||||
else:
|
||||
content = open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read().split(" ")
|
||||
txt = ""
|
||||
lang = "en"
|
||||
if not utterance_metadata and text_exists:
|
||||
utterance_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
|
||||
|
||||
phn = cfg.tokenizer.encode("".join(content))
|
||||
phn = "".join(utterance_metadata["phonemes"])
|
||||
phn = cfg.tokenizer.encode(phn)
|
||||
phn = np.array(phn).astype(np.uint8)
|
||||
|
||||
if "text" not in group:
|
||||
group.create_dataset('text', data=phn, compression='lzf')
|
||||
|
||||
group.attrs['phonemes'] = len(phn)
|
||||
group.attrs['transcription'] = txt
|
||||
group.attrs['language'] = lang
|
||||
for k, v in utterance_metadata.items():
|
||||
group.attrs[k] = v
|
||||
metadata[id][k] = v
|
||||
|
||||
metadata[id]["phones"] = len(phn)
|
||||
metadata[id]["transcription"] = txt
|
||||
metadata[id]["language"] = lang
|
||||
else:
|
||||
group.attrs['phonemes'] = 0
|
||||
metadata[id]["phones"] = 0
|
||||
except Exception as e:
|
||||
#raise e
|
||||
print(id, e)
|
||||
#pass
|
||||
tqdm.write(f'Error while processing {id}: {e}')
|
||||
|
||||
with open(str(metadata_path), "w", encoding="utf-8") as f:
|
||||
f.write( json.dumps( metadata ) )
|
||||
|
@ -1002,119 +964,6 @@ def create_dataset_hdf5( skip_existing=True ):
|
|||
hf.create_dataset('symmap', data=json.dumps(symmap))
|
||||
hf.close()
|
||||
|
||||
def extract_dataset_hdf5( skip_existing=True ):
    """Export every utterance stored in the HDF5 dataset back to on-disk files.

    For each entry under ``training`` / ``validation`` / ``noise`` this writes:

    * ``<root>/<group>/<name>/<id>.enc``  — the quantized audio codes,
      saved with ``torch.save``
    * ``<root>/<group>/<name>/<id>.json`` — the phoneme transcription,
      decoded through the legacy ``reverse_symmap`` token table

    Files that already exist on disk are left untouched.

    NOTE(review): ``skip_existing`` is accepted for signature parity with the
    sibling dataset helpers but is not consulted here; per-file existence is
    checked explicitly via ``Path.exists`` instead.
    """
    cfg.dataset.use_hdf5 = True
    cfg.load_hdf5(write=False)  # read-only: we only extract, never mutate the HDF5
    hf = cfg.hdf5

    # Legacy token-id -> phoneme string table used by the previous tokenizer;
    # entries 1/2 are the old <s>/</s> sentinels stripped below.
    reverse_symmap = {"1":"<s>","2":"</s>","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"dˌ","12":"mˌ","13":"d","14":"ɹ","15":"tˈ","16":"pˌ","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"wˌ","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"hˌ","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"bˌ","75":"kˈ","76":"ɔ","77":"zˈ","78":"ᵻ","79":"kˌ","80":"vˈ","81":"fˌ","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"tˌ","86":"pˈ","87":"ðˌ","88":"sˌ","89":"nˌ","90":"lˌ","91":"̩","92":"ʔ","93":"vˌ","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"jˌ","101":"uːˈ","102":"iːˈ","103":"zˌ","104":".ˈ","105":"…","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"iˌ","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"-ˌ","127":"ɫ","128":"q","129":"—","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":".ˌ","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"?ˌ","150":",ˌ","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"!ˌ","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"qˌ","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"”","180":"“","181":"“ˈ","182":"“ˌ","183":";ˈ","184":";ˌ","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"ᵝ","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"oˌ","221":"eˈ","222":"ʍ","223":"eˌ","224":"uˌ","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}

    root = str(cfg.data_dir)

    def add( type="training", audios=True, texts=True ):
        # Walk the HDF5 hierarchy: <type>/data/<group>/<name>/<id>
        for group in tqdm( hf[f'{type}/data/'].keys(), desc=f"Processing {type}"):
            for name in tqdm( hf[f'{type}/data/{group}'].keys(), desc=f"Processing {group}"):
                (cfg.data_dir / group / name).mkdir(parents=True, exist_ok=True)

                for id in tqdm( hf[f'{type}/data/{group}/{name}'].keys(), desc=f"Processing {name}"):
                    key = f'{type}/data/{group}/{name}/{id}'

                    if key not in hf:
                        tqdm.write(f'Missing key: {key}')
                        continue

                    audio_exists = "audio" in hf[key]
                    text_exists = "text" in hf[key]

                    # Only fully-populated entries are extracted.
                    if not audio_exists or not text_exists:
                        tqdm.write(f'Missing audio/text: {key}')
                        continue

                    audio_path = Path(f'{root}/{group}/{name}/{id}.enc')
                    text_path = Path(f'{root}/{group}/{name}/{id}.json')

                    # audio: dump the raw quantized codes as-is
                    if audios and audio_exists and not audio_path.exists():
                        qnt = hf[key]["audio"][:, :]
                        torch.save( qnt, audio_path )

                    # text: strip the old <s>/</s> sentinels ([1:-1]) and map
                    # token ids back to phoneme strings
                    if texts and text_exists and not text_path.exists():
                        tokens = hf[key]["text"][:][1:-1]
                        phones = [ reverse_symmap[f'{token}'] for token in tokens ]
                        # NOTE(review): both replace() arguments render as a plain
                        # space — likely a character lost in transit; confirm the
                        # intended source character before relying on this.
                        phones = list("".join(phones).replace(" ", " "))

                        j = {
                            "text": "",
                            "phonemes": phones,
                            "language": "en"
                        }

                        with open(text_path, "w", encoding="utf-8") as f:
                            f.write( json.dumps( j ) )

    add( type="training" )
    add( type="validation" )
    add( type="noise", texts=False )  # noise split carries no transcriptions

    hf.close()
|
||||
|
||||
def retokenize_dataset_hdf5( skip_existing=True ):
    """Re-encode every stored text entry with the current ``cfg.tokenizer``.

    Decodes each utterance's legacy token ids back to phoneme strings via
    ``reverse_symmap``, re-tokenizes them with the active tokenizer, replaces
    the ``text`` dataset in-place, and finally rewrites the ``symmap``
    dataset so the HDF5 file matches the new tokenizer.

    NOTE(review): ``skip_existing`` is accepted for signature parity with the
    sibling dataset helpers but is not consulted here — every entry is
    retokenized unconditionally.
    """
    cfg.dataset.use_hdf5 = True
    cfg.load_hdf5(write=True)  # opened writable: text datasets are replaced in-place
    hf = cfg.hdf5

    symmap = get_phone_symmap()

    # Legacy token-id -> phoneme string table used by the previous tokenizer;
    # entries 1/2 are the old <s>/</s> sentinels stripped below.
    reverse_symmap = {"1":"<s>","2":"</s>","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"dˌ","12":"mˌ","13":"d","14":"ɹ","15":"tˈ","16":"pˌ","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"wˌ","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"hˌ","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"bˌ","75":"kˈ","76":"ɔ","77":"zˈ","78":"ᵻ","79":"kˌ","80":"vˈ","81":"fˌ","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"tˌ","86":"pˈ","87":"ðˌ","88":"sˌ","89":"nˌ","90":"lˌ","91":"̩","92":"ʔ","93":"vˌ","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"jˌ","101":"uːˈ","102":"iːˈ","103":"zˌ","104":".ˈ","105":"…","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"iˌ","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"-ˌ","127":"ɫ","128":"q","129":"—","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":".ˌ","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"?ˌ","150":",ˌ","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"!ˌ","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"qˌ","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"”","180":"“","181":"“ˈ","182":"“ˌ","183":";ˈ","184":";ˌ","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"ᵝ","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"oˌ","221":"eˈ","222":"ʍ","223":"eˌ","224":"uˌ","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}

    root = str(cfg.data_dir)

    def add( type="training" ):
        # Walk the HDF5 hierarchy: <type>/data/<group>/<name>/<id>
        for group in tqdm( hf[f'{type}/data/'].keys(), desc=f"Processing {type}"):
            for name in tqdm( hf[f'{type}/data/{group}'].keys(), desc=f"Processing {group}"):
                (cfg.data_dir / group / name).mkdir(parents=True, exist_ok=True)

                for id in tqdm( hf[f'{type}/data/{group}/{name}'].keys(), desc=f"Processing {name}"):
                    key = f'{type}/data/{group}/{name}/{id}'

                    if key not in hf:
                        tqdm.write(f'Missing key: {key}')
                        continue

                    if "text" not in hf[key]:
                        tqdm.write(f'Missing text: {key}')
                        continue

                    # Strip the old <s>/</s> sentinels ([1:-1]) and decode the
                    # legacy ids back to a phoneme character list.
                    tokens = hf[key]["text"][:][1:-1]
                    # NOTE(review): both replace() arguments render as a plain
                    # space — likely a character lost in transit; confirm the
                    # intended source character before relying on this.
                    content = list("".join([ reverse_symmap[f'{token}'] for token in tokens ]).replace(" ", " "))

                    # Re-encode with the current tokenizer.
                    # NOTE(review): uint8 silently truncates token ids >= 256 —
                    # confirm the new tokenizer's vocabulary fits in a byte.
                    tokens = cfg.tokenizer.encode("".join(content))
                    tokens = np.array(tokens).astype(np.uint8)

                    # h5py datasets cannot be resized arbitrarily: delete and recreate.
                    del hf[key]['text']
                    hf[key].create_dataset('text', data=tokens, compression='lzf')

    add( type="training" )
    add( type="validation" )

    # Rewrite the symbol map so the file matches the tokenizer used above.
    if "symmap" in hf:
        del hf['symmap']

    hf.create_dataset('symmap', data=json.dumps(symmap))
    hf.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
|
@ -1135,10 +984,6 @@ if __name__ == "__main__":
|
|||
|
||||
if args.action == "hdf5":
|
||||
create_dataset_hdf5()
|
||||
if args.action == "extract-hdf5":
|
||||
extract_dataset_hdf5()
|
||||
if args.action == "retokenize-hdf5":
|
||||
retokenize_dataset_hdf5()
|
||||
elif args.action == "list-dataset":
|
||||
dataset = []
|
||||
for group in os.listdir(cfg.data_dir):
|
||||
|
@ -1147,7 +992,7 @@ if __name__ == "__main__":
|
|||
continue
|
||||
dataset.append(f'{group}/{name}')
|
||||
|
||||
print(dataset)
|
||||
print(json.dumps(dataset))
|
||||
elif args.action == "metadata":
|
||||
create_dataset_metadata()
|
||||
elif args.action == "sample":
|
||||
|
|
Loading…
Reference in New Issue
Block a user