final tweaks, hopefully, again

mrq 2024-05-15 23:04:19 -05:00
parent 8d79f78e0a
commit d9aabfa3ae
4 changed files with 450 additions and 570 deletions

View File

@@ -41,9 +41,7 @@
} }
], ],
"normalizer": null, "normalizer": null,
"pre_tokenizer": { "pre_tokenizer": null,
"type": "Whitespace"
},
"post_processor": { "post_processor": {
"type": "TemplateProcessing", "type": "TemplateProcessing",
"single": [ "single": [
@@ -110,263 +108,264 @@
"end_of_word_suffix": null, "end_of_word_suffix": null,
"fuse_unk": false, "fuse_unk": false,
"byte_fallback": false, "byte_fallback": false,
"ignore_merges": false,
"vocab": { "vocab": {
"<unk>": 0, "<unk>": 0,
"<bos>": 1, "<bos>": 1,
"</eos>": 2, "</eos>": 2,
"<mask>": 3, "<mask>": 3,
"!": 4, " ": 4,
"\"": 5, "!": 5,
"(": 6, "\"": 6,
")": 7, "(": 7,
",": 8, ")": 8,
"-": 9, ",": 9,
".": 10, "-": 10,
"1": 11, ".": 11,
":": 12, "1": 12,
"?": 13, ":": 13,
"a": 14, ";": 14,
"b": 15, "?": 15,
"c": 16, "a": 16,
"d": 17, "b": 17,
"e": 18, "c": 18,
"f": 19, "d": 19,
"h": 20, "e": 20,
"i": 21, "f": 21,
"j": 22, "h": 22,
"k": 23, "i": 23,
"l": 24, "j": 24,
"m": 25, "k": 25,
"n": 26, "l": 26,
"o": 27, "m": 27,
"p": 28, "n": 28,
"q": 29, "o": 29,
"r": 30, "p": 30,
"s": 31, "q": 31,
"t": 32, "r": 32,
"u": 33, "s": 33,
"v": 34, "t": 34,
"w": 35, "u": 35,
"x": 36, "v": 36,
"z": 37, "w": 37,
"¡": 38, "x": 38,
"«": 39, "z": 39,
"»": 40, "¡": 40,
"¿": 41, "«": 41,
"æ": 42, "»": 42,
"ç": 43, "¿": 43,
"ð": 44, "æ": 44,
"ŋ": 45, "ç": 45,
"ɐ": 46, "ð": 46,
"ɑ": 47, "ŋ": 47,
"ɔ": 48, "ɐ": 48,
"ɕ": 49, "ɑ": 49,
"ə": 50, "ɔ": 50,
"ɚ": 51, "ɕ": 51,
"ɛ": 52, "ə": 52,
"ɜ": 53, "ɚ": 53,
"ɟ": 54, "ɛ": 54,
"ɡ": 55, "ɜ": 55,
"ɪ": 56, "ɟ": 56,
"ɬ": 57, "ɡ": 57,
"ɯ": 58, "ɪ": 58,
"ɹ": 59, "ɬ": 59,
"ɾ": 60, "ɯ": 60,
"ʃ": 61, "ɹ": 61,
"ʈ": 62, "ɾ": 62,
"ʊ": 63, "ʃ": 63,
"ʋ": 64, "ʈ": 64,
"ʌ": 65, "ʊ": 65,
"ʑ": 66, "ʋ": 66,
"ʒ": 67, "ʌ": 67,
"ʔ": 68, "ʑ": 68,
"ʲ": 69, "ʒ": 69,
"ˈ": 70, "ʔ": 70,
"ˌ": 71, "ʲ": 71,
"ː": 72, "ˈ": 72,
"̃": 73, "ˌ": 73,
"̩": 74, "ː": 74,
"θ": 75, "̃": 75,
"": 76, "̩": 76,
"": 77, "θ": 77,
"": 78, "": 78,
"ˈɛ": 79, "": 79,
"iː": 80, "": 80,
"aɪ": 81, "": 81,
"nd": 82, "": 82,
"ˈɪ": 83, "ˈɛ": 83,
"eɪ": 84, "iː": 84,
"ˈæ": 85, "aɪ": 85,
"": 86, "nd": 86,
"ðə": 87, "ˈɪ": 87,
"ɑː": 88, "eɪ": 88,
"ˈeɪ": 89, "ˈæ": 89,
"ən": 90, "ðə": 90,
"uː": 91, "": 91,
"ˈʌ": 92, "ɑː": 92,
"ˈaɪ": 93, "ˈeɪ": 93,
"st": 94, "ən": 94,
"ˈɔ": 95, "uː": 95,
"ˈ": 96, "ˈʌ": 96,
"ˈiː": 97, "ˈaɪ": 97,
"ˈɑː": 98, "st": 98,
"ænd": 99, "ˈɔ": 99,
"ːɹ": 100, "ˈ": 100,
"ɪŋ": 101, "ˈiː": 101,
"ɜː": 102, "ˈɑː": 102,
"ɪn": 103, "ænd": 103,
"": 104, "ːɹ": 104,
"ʌv": 105, "ɪŋ": 105,
"": 106, "ɜː": 106,
"əl": 107, "ɪn": 107,
"ˈuː": 108, "": 108,
"": 109, "ʌv": 109,
"ɪz": 110, "": 110,
"ˈɜː": 111, "əl": 111,
"ˌʌ": 112, "ˈuː": 112,
"æt": 113, "tʃ": 113,
"": 114, "ɪz": 114,
"ˈɔː": 115, "ˈɜː": 115,
"ɪt": 116, "ˌʌ": 116,
"ˈ": 117, "æt": 117,
"ɚɹ": 118, "": 118,
"ˈɛn": 119, "ˈɔː": 119,
"": 120, "ɪt": 120,
"li": 121, "ˈ": 121,
"hiː": 122, "ɚɹ": 122,
"ˌɛ": 123, "ˈɛn": 123,
"wɪ": 124, "wʌ": 124,
"ðæt": 125, "li": 125,
"wʌz": 126, "hiː": 126,
"juː": 127, "ˌɛ": 127,
"oːɹ": 128, "wɪ": 128,
"ðɪ": 129, "wʌz": 129,
"sˈɛ": 130, "ðæt": 130,
"ˈɑːɹ": 131, "juː": 131,
"ˌɪ": 132, "oːɹ": 132,
"nt": 133, "ðɪ": 133,
"ˈʊ": 134, "sˈɛ": 134,
"ənt": 135, "ˌɪ": 135,
"hɪz": 136, "ˈɑːɹ": 136,
"": 137, "nt": 137,
"ˌɑː": 138, "ˈʊ": 138,
"ɔːɹ": 139, "ənt": 139,
"ˈɛɹ": 140, "hɪz": 140,
"wɪð": 141, "ˌɑː": 141,
"ᵻd": 142, "": 142,
"ˈoːɹ": 143, "ɔːɹ": 143,
"pɹ": 144, "ˈɛɹ": 144,
"ˈɔːl": 145, "wɪð": 145,
"": 146, "ᵻd": 146,
"ʃən": 147, "ˈoːɹ": 147,
"kt": 148, "": 148,
"ˌoʊ": 149, "ˈɔːl": 149,
"ˈɔːɹ": 150, "": 150,
"": 151, "ʃən": 151,
"æz": 152, "kt": 152,
"ʃiː": 153, "ˌoʊ": 153,
"ˌʌt": 154, "ˈɔːɹ": 154,
"ˈɛl": 155, "": 155,
"ˌaʊ": 156, "æz": 156,
"ˈʌn": 157, "ˌʌt": 157,
"əs": 158, "ʃiː": 158,
"ː": 159, "ˈɛl": 159,
"lˈaɪ": 160, "ˌaʊ": 160,
"ˈæn": 161, "ˈʌn": 161,
"ˈɪɹ": 162, "əs": 162,
"ʊd": 163, "ː": 163,
"ɹᵻ": 164, "lˈaɪ": 164,
"ld": 165, "ˈæn": 165,
"bˌʌt": 166, "ˈɪɹ": 166,
"ks": 167, "ʊd": 167,
"nˈ": 168, "ɹᵻ": 168,
"ɾɚ": 169, "ld": 169,
"hæd": 170, "bˌʌt": 170,
"ɛɹ": 171, "ks": 171,
"ˈɪŋ": 172, "nˈ": 172,
"ɡɹ": 173, "hæd": 173,
"ɔn": 174, "ɾɚ": 174,
"ɑː": 175, "ɛɹ": 175,
"maɪ": 176, "ˈɪŋ": 176,
"": 177, "ɡɹ": 177,
"ːɹ": 178, "ɑː": 178,
"ðɚ": 179, "ɔn": 179,
"": 180, "": 180,
"ðɛɹ": 181, "maɪ": 181,
"ˈʌm": 182, "ːɹ": 182,
"ɑːt": 183, "ðɚ": 183,
"tɹ": 184, "tʊ": 184,
"sˈiː": 185, "ðɛɹ": 185,
"ʌvðə": 186, "ɑːt": 186,
"mˈɪ": 187, "ˈʌm": 187,
"ˈæp": 188, "": 188,
"ˌɪm": 189, "sˈiː": 189,
"ɪk": 190, "ʌvðə": 190,
"sp": 191, "mˈɪ": 191,
"lˈeɪ": 192, "hˈæ": 192,
"hˌɪm": 193, "ˌɪm": 193,
"ɐn": 194, "lˈeɪ": 194,
"ðeɪ": 195, "ɪk": 195,
"lˈɪ": 196, "sp": 196,
"ɾi": 197, "ɪm": 197,
"": 198, "ɐn": 198,
"lˈɛ": 199, "ðeɪ": 199,
"": 200, "lˈɪ": 200,
"ˈɪl": 201, "ɾi": 201,
"jˈuː": 202, "lˈɛ": 202,
"ʌm": 203, "": 203,
"mˌiː": 204, "": 204,
"lˈæ": 205, "lˈæ": 205,
"ˌɪn": 206, "ˈɪl": 206,
"bᵻ": 207, "jˈuː": 207,
"wˈʌn": 208, "ʌm": 208,
"ˈɪn": 209, "mˌiː": 209,
"ˈoʊn": 210, "bᵻ": 210,
"biː": 211, "wˈʌn": 211,
"sˈɛd": 212, "ˌɪn": 212,
"ˈɛd": 213, "ˈɪn": 213,
"ˈaɪt": 214, "ˈoʊn": 214,
"fɹʌm": 215, "sˈɛd": 215,
"baɪ": 216, "biː": 216,
"ɪs": 217, "ˈɛd": 217,
"ɚz": 218, "ˈaɪt": 218,
"ðɪs": 219, "baɪ": 219,
"əns": 220, "fɹʌm": 220,
"ɪf": 221, "ɪs": 221,
"bəl": 222, "ɚz": 222,
"ˈænd": 223, "ðɪs": 223,
"ɪnðə": 224, "əns": 224,
"əm": 225, "bəl": 225,
"iːz": 226, "ɪf": 226,
"ˌuː": 227, "ɪnðə": 227,
"ᵻz": 228, "əm": 228,
"wˈeɪ": 229, "ᵻz": 229,
"ft": 230, "ˌuː": 230,
"wiː": 231, "wˈeɪ": 231,
"lˈiː": 232, "ft": 232,
"stɹ": 233, "wiː": 233,
"": 234, "stɹ": 234,
"ɚd": 235, "lˈiː": 235,
"ˌaɪ": 236, "iːz": 236,
"kw": 237, "pt": 237,
"ˌɔn": 238, "": 238,
"ˈaɪd": 239, "ɚd": 239,
"ts": 240, "ˌaɪ": 240,
"ɪm": 241, "kw": 241,
"ˈʌst": 242, "ˌɔn": 242,
"ˈoʊld": 243, "ˈaɪd": 243,
"ˌɪ": 244, "ɪm": 244,
"dˈɪ": 245, "ˈʌst": 245,
"sˌoʊ": 246, "ˈoʊld": 246,
"ɑːɹ": 247, "ts": 247,
"": 248, "ˌɪ": 248,
"sˈeɪ": 249, "sˌoʊ": 249,
"ɾᵻd": 250, "dˈɪ": 250,
"dᵻ": 251, "ɑːɹ": 251,
"ɪ": 252, "": 252,
"sˈɛl": 253, "sˈeɪ": 253,
"ɹi": 254, "ɾᵻd": 254,
"ˈʌðɚ": 255 "ɪ": 255
}, },
"merges": [ "merges": [
"ˈ ɛ", "ˈ ɛ",
@@ -376,8 +375,8 @@
"ˈ ɪ", "ˈ ɪ",
"e ɪ", "e ɪ",
"ˈ æ", "ˈ æ",
"o ʊ",
"ð ə", "ð ə",
"o ʊ",
"ɑ ː", "ɑ ː",
"ˈ eɪ", "ˈ eɪ",
"ə n", "ə n",
@@ -415,20 +414,20 @@
"h iː", "h iː",
"ˌ ɛ", "ˌ ɛ",
"w ɪ", "w ɪ",
"ð æt",
"wʌ z", "wʌ z",
"ð æt",
"j uː", "j uː",
"o ːɹ", "o ːɹ",
ɪ", ɪ",
"s ˈɛ", "s ˈɛ",
"ˈɑː ɹ",
ɪ", ɪ",
"ˈɑː ɹ",
"n t", "n t",
"ˈ ʊ", "ˈ ʊ",
"ən t", "ən t",
"h ɪz", "h ɪz",
"h æ",
ɑː", ɑː",
"h æ",
"ɔ ːɹ", "ɔ ːɹ",
"ˈɛ ɹ", "ˈɛ ɹ",
"wɪ ð", "wɪ ð",
@@ -443,8 +442,8 @@
"ˈɔ ːɹ", "ˈɔ ːɹ",
"f ɹ", "f ɹ",
"æ z", "æ z",
"ʃ iː",
"ˌʌ t", "ˌʌ t",
"ʃ iː",
"ˈɛ l", "ˈɛ l",
"ˌ aʊ", "ˌ aʊ",
"ˈʌ n", "ˈʌ n",
@@ -459,93 +458,89 @@
"b ˌʌt", "b ˌʌt",
"k s", "k s",
"n ˈoʊ", "n ˈoʊ",
"ɾ ɚ",
"hæ d", "hæ d",
"ɾ ɚ",
"ɛ ɹ", "ɛ ɹ",
"ˈɪ ŋ", "ˈɪ ŋ",
"ɡ ɹ", "ɡ ɹ",
"ɔ n",
"n ˌɑː", "n ˌɑː",
"m aɪ", "ɔ n",
"v ɚ", "v ɚ",
"m aɪ",
"f ɔːɹ", "f ɔːɹ",
"ð ɚ", "ð ɚ",
"t ʊ", "t ʊ",
"ð ɛɹ", "ð ɛɹ",
"ˈʌ m",
"nˌɑː t", "nˌɑː t",
"ˈʌ m",
"t ɹ", "t ɹ",
"s ˈiː", "s ˈiː",
"ʌv ðə", "ʌv ðə",
"m ˈɪ", "m ˈɪ",
"ˈæ p", "h ˈæ",
"ˌɪ m", "ˌɪ m",
"l ˈeɪ",
"ɪ k", "ɪ k",
"s p", "s p",
"l ˈeɪ",
"h ˌɪm", "h ˌɪm",
"ɐ n", "ɐ n",
"ð eɪ", "ð eɪ",
"l ˈɪ", "l ˈɪ",
"ɾ i", "ɾ i",
"b ɹ",
"l ˈɛ", "l ˈɛ",
"b ɹ",
"k ɹ", "k ɹ",
"l ˈæ",
"ˈɪ l", "ˈɪ l",
"j ˈuː", "j ˈuː",
"ʌ m", "ʌ m",
"mˌ iː", "mˌ iː",
"l ˈæ",
ɪn",
"b ᵻ", "b ᵻ",
"w ˈʌn", "w ˈʌn",
ɪn",
"ˈɪ n", "ˈɪ n",
"ˈoʊ n", "ˈoʊ n",
"b iː",
"sˈɛ d", "sˈɛ d",
"b iː",
"ˈɛ d", "ˈɛ d",
"ˈaɪ t", "ˈaɪ t",
"fɹ ʌm",
"b aɪ", "b aɪ",
"fɹ ʌm",
"ɪ s", "ɪ s",
"ɚ z", "ɚ z",
"ðɪ s", "ðɪ s",
"ən s", "ən s",
"ɪ f",
"b əl", "b əl",
"ˈæ nd", "ɪ f",
"ɪn ðə", "ɪn ðə",
"ə m", "ə m",
"iː z",
"ˌ uː",
"ᵻ z", "ᵻ z",
"ˌ uː",
"w ˈeɪ", "w ˈeɪ",
"f t", "f t",
"w iː", "w iː",
"l ˈiː",
"st ɹ", "st ɹ",
"l ˈiː",
"iː z",
"p t",
"j ʊ", "j ʊ",
"ɚ d", "ɚ d",
"ˌ aɪ", "ˌ aɪ",
"k w", "k w",
"ˌ ɔn", "ˌ ɔn",
"ˈaɪ d", "ˈaɪ d",
"t s",
"ɪ m", "ɪ m",
"ˈʌ st", "ˈʌ st",
"ˈoʊ ld", "ˈoʊ ld",
"t s",
"ˌɪ tʃ", "ˌɪ tʃ",
"d ˈɪ",
"s ˌoʊ", "s ˌoʊ",
"d ˈɪ",
"ɑː ɹ", "ɑː ɹ",
"h ɐ", "h ɐ",
"s ˈeɪ", "s ˈeɪ",
"ɾ ᵻd", "ɾ ᵻd",
"d ᵻ", "w ˌɪtʃ"
"w ˌɪtʃ",
"sˈɛ l",
"ɹ i",
"ˈʌ ðɚ"
] ]
} }
} }
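Aside: the retrained tokenizer drops the Whitespace pre-tokenizer and instead promotes " " and ";" to ordinary vocab entries, shifting the ids of everything after <mask>. A minimal sanity check against the updated JSON; the file path is an assumption, since it isn't shown on this page:

import json

with open("./training/tokenizer.json", "r", encoding="utf-8") as f:
    tok = json.load(f)

assert tok["pre_tokenizer"] is None            # was {"type": "Whitespace"}
vocab = tok["model"]["vocab"]
assert vocab[" "] == 4 and vocab[";"] == 14    # new entries; later ids shift accordingly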

View File

@@ -6,16 +6,22 @@ import torchaudio
from tqdm.auto import tqdm from tqdm.auto import tqdm
from pathlib import Path from pathlib import Path
from vall_e.config import cfg from vall_e.config import cfg
# things that could be args
cfg.sample_rate = 44_000
cfg.inference.audio_backend = "dac"
"""
cfg.inference.weight_dtype = "bfloat16"
cfg.inference.dtype = torch.bfloat16
cfg.inference.amp = True
"""
from vall_e.emb.g2p import encode as valle_phonemize from vall_e.emb.g2p import encode as valle_phonemize
from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension
# things that could be args
cfg.sample_rate = 24_000
cfg.inference.audio_backend = "encodec"
input_audio = "voices" input_audio = "voices"
input_metadata = "./training/metadata" input_metadata = "metadata"
output_dataset = f"./training/data-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}" output_dataset = f"training-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}"
device = "cuda" device = "cuda"
audio_extension = ".dac" if cfg.inference.audio_backend == "dac" else ".enc" audio_extension = ".dac" if cfg.inference.audio_backend == "dac" else ".enc"
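For reference, the folder name built above encodes both knobs; evaluating the f-string for the two configurations this commit switches between:

for sample_rate, backend in [(24_000, "encodec"), (44_000, "dac")]:
    print(f"training-{'2' if sample_rate == 24_000 else '4'}4KHz-{backend}")
# -> training-24KHz-encodec
# -> training-44KHz-dac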
@@ -34,9 +40,6 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
if not os.path.isdir(f'./{input_audio}/{dataset_name}/'): if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
print("Is not dir:", f'./{input_audio}/{dataset_name}/') print("Is not dir:", f'./{input_audio}/{dataset_name}/')
continue continue
if dataset_name in ["LibriVox", "Audiobooks"]:
continue
for speaker_id in tqdm(sorted(os.listdir(f'./{input_audio}/{dataset_name}/')), desc=f"Processing speaker in {dataset_name}"): for speaker_id in tqdm(sorted(os.listdir(f'./{input_audio}/{dataset_name}/')), desc=f"Processing speaker in {dataset_name}"):
if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'): if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
@@ -55,10 +58,29 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
waveform, sample_rate = torchaudio.load(inpath) waveform, sample_rate = torchaudio.load(inpath)
qnt = valle_quantize(waveform, sr=sample_rate, device=device) qnt = valle_quantize(waveform, sr=sample_rate, device=device)
if cfg.inference.audio_backend == "dac": if cfg.inference.audio_backend == "dac":
qnt.save(_replace_file_extension(outpath, audio_extension)) np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.codes.numpy().astype(np.uint16),
"metadata": {
"original_length": qnt.original_length,
"sample_rate": qnt.sample_rate,
"input_db": qnt.input_db.numpy().astype(np.float32),
"chunk_length": qnt.chunk_length,
"channels": qnt.channels,
"padding": qnt.padding,
"dac_version": "1.0.0",
},
})
else: else:
torch.save( qnt, _replace_file_extension(outpath, audio_extension) ) np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.numpy().astype(np.uint16),
"metadata": {
"original_length": waveform.shape[-1],
"sample_rate": sample_rate,
},
})
continue continue
@@ -91,7 +113,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
fname = filename.replace(f'.{extension}', "") fname = filename.replace(f'.{extension}', "")
waveform, sample_rate = None, None waveform, sample_rate = None, None
language = metadata[filename]["language"] if "language" in metadata[filename] else "english" language = metadata[filename]["language"] if "language" in metadata[filename] else "en"
if len(metadata[filename]["segments"]) == 0 or not use_slices: if len(metadata[filename]["segments"]) == 0 or not use_slices:
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}.{extension}') outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}.{extension}')
@@ -100,86 +122,101 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
if len(text) == 0: if len(text) == 0:
continue continue
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists(): if _replace_file_extension(outpath, audio_extension).exists():
continue continue
if not _replace_file_extension(outpath, ".json").exists(): if waveform is None:
txts.append(( waveform, sample_rate = torchaudio.load(inpath)
outpath, if waveform.shape[0] > 1:
text, waveform = torch.mean(waveform, dim=0, keepdim=True)
language,
))
if not _replace_file_extension(outpath, audio_extension).exists():
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
wavs.append(( wavs.append((
outpath, outpath,
waveform, text,
sample_rate language,
)) waveform,
sample_rate
))
else: else:
i = 0 i = 0
for segment in metadata[filename]["segments"]: for segment in metadata[filename]["segments"]:
id = pad(i, 4) id = pad(i, 4)
i = i + 1 i = i + 1
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists(): outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
text = metadata[filename]["text"]
if len(text) == 0:
continue continue
if not _replace_file_extension(outpath, ".json").exists(): if _replace_file_extension(outpath, audio_extension).exists():
txts.append(( continue
outpath,
segment["text"],
language,
))
if not _replace_file_extension(outpath, audio_extension).exists():
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
start = int(segment['start'] * sample_rate) if waveform is None:
end = int(segment['end'] * sample_rate) waveform, sample_rate = torchaudio.load(inpath)
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0, keepdim=True)
if start < 0: start = int(segment['start'] * sample_rate)
start = 0 end = int(segment['end'] * sample_rate)
if end >= waveform.shape[-1]:
end = waveform.shape[-1] - 1
if end - start < 0: if start < 0:
continue start = 0
if end >= waveform.shape[-1]:
end = waveform.shape[-1] - 1
wavs.append(( if end - start < 0:
outpath, continue
waveform[:, start:end],
sample_rate
))
if len(txts) > 0: wavs.append((
for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}", disable=True): outpath,
outpath, text, language = job text,
phones = valle_phonemize(text) language,
data = { waveform[:, start:end],
"text": text.strip(), sample_rate
"phonemes": phones, ))
"language": language,
}
open(_replace_file_extension(outpath, ".json"), 'w', encoding='utf-8').write(json.dumps(data))
if len(wavs) > 0: if len(wavs) > 0:
for job in tqdm(wavs, desc=f"Quantizing: {speaker_id}"): for job in tqdm(wavs, desc=f"Quantizing: {speaker_id}"):
try: try:
outpath, waveform, sample_rate = job outpath, text, language, waveform, sample_rate = job
phones = valle_phonemize(text)
qnt = valle_quantize(waveform, sr=sample_rate, device=device) qnt = valle_quantize(waveform, sr=sample_rate, device=device)
if cfg.inference.audio_backend == "dac": if cfg.inference.audio_backend == "dac":
qnt.save(_replace_file_extension(outpath, audio_extension)) np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.codes.numpy().astype(np.uint16),
"metadata": {
"original_length": qnt.original_length,
"sample_rate": qnt.sample_rate,
"input_db": qnt.input_db.numpy().astype(np.float32),
"chunk_length": qnt.chunk_length,
"channels": qnt.channels,
"padding": qnt.padding,
"dac_version": "1.0.0",
"text": text.strip(),
"phonemes": "".join(phones),
"language": language,
},
})
else: else:
torch.save( qnt, _replace_file_extension(outpath, audio_extension) ) np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), {
"codes": qnt.numpy().astype(np.uint16),
"metadata": {
"original_length": waveform.shape[-1],
"sample_rate": sample_rate,
"text": text.strip(),
"phonemes": "".join(phones),
"language": language,
},
})
except Exception as e: except Exception as e:
print(f"Failed to quantize: {outpath}:", e) print(f"Failed to quantize: {outpath}:", e)
continue continue
open("./missing.json", 'w', encoding='utf-8').write(json.dumps(missing)) open("./missing.json", 'w', encoding='utf-8').write(json.dumps(missing))
open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset)) open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))

View File

@@ -38,20 +38,23 @@ else:
metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/{id}') metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/{id}')
metadata = json.loads(open(metadata_path, "r", encoding="utf-8").read()) metadata = json.loads(open(metadata_path, "r", encoding="utf-8").read())
if "phonemes" not in metadata:
continue
tokenizer_data.append( f'{"".join(metadata["phonemes"])}' ) tokenizer_data.append( f'{"".join(metadata["phonemes"])}' )
open(output_file, 'w', encoding='utf-8').write(json.dumps(tokenizer_data)) open(output_file, 'w', encoding='utf-8').write(json.dumps(tokenizer_data))
unk_token = "<unk>" unk_token = "<unk>"
spl_tokens = ["<bos>", "</eos>", unk_token, "<mask>"] spl_tokens = [unk_token, "<bos>", "</eos>", "<mask>", "<space>"]
trainer = BpeTrainer(special_tokens = spl_tokens, vocab_size = 256) trainer = BpeTrainer(special_tokens = spl_tokens, vocab_size = 256)
tokenizer = Tokenizer(BPE(unk_token = unk_token)) tokenizer = Tokenizer(BPE(unk_token = unk_token))
tokenizer.pre_tokenizer = Whitespace() tokenizer.pre_tokenizer = Whitespace() # takes 2 hours to process without this, we'll just manually add spaces as a token
tokenizer.post_processor = TemplateProcessing( tokenizer.post_processor = TemplateProcessing(
single="<bos> $A <eos>", single="<bos> $A <eos>",
special_tokens=[("<bos>", 1), ("<eos>", 2)], special_tokens=[("<bos>", 1), ("<eos>", 2)],
) )
tokenizer.train_from_iterator(tokenizer_data, trainer=trainer) tokenizer.train_from_iterator(tokenizer_data, trainer=trainer)
tokenizer.save("./training/tokenizer.json") tokenizer.save("./training/tokenizer_training_data.json")

View File

@@ -86,19 +86,15 @@ def _calculate_durations( type="training" ):
def _load_paths(dataset, type="training"): def _load_paths(dataset, type="training"):
return { cfg.get_spkr( cfg.data_dir / data_dir / "dummy" ): _load_paths_from_metadata( data_dir, type=type, validate=cfg.dataset.validate and type == "training" ) for data_dir in tqdm(dataset, desc=f"Parsing dataset: {type}") } return { cfg.get_spkr( cfg.data_dir / data_dir / "dummy" ): _load_paths_from_metadata( data_dir, type=type, validate=cfg.dataset.validate and type == "training" ) for data_dir in tqdm(dataset, desc=f"Parsing dataset: {type}") }
def _load_paths_from_metadata(dataset_name, type="training", validate=False): def _load_paths_from_metadata(group_name, type="training", validate=False):
data_dir = dataset_name if cfg.dataset.use_hdf5 else cfg.data_dir / dataset_name data_dir = group_name if cfg.dataset.use_hdf5 else cfg.data_dir / group_name
_fn = _get_hdf5_paths if cfg.dataset.use_hdf5 else _get_paths_of_extensions _fn = _get_hdf5_paths if cfg.dataset.use_hdf5 else _get_paths_of_extensions
def key( id ): def key( id, entry=None ):
if not cfg.dataset.use_hdf5: return f"/{type}/{_get_hdf5_path(data_dir)}/{id}" if cfg.dataset.use_hdf5 else data_dir / id
return data_dir / id
return f"/{type}/{_get_hdf5_path(data_dir)}/{id}" metadata_path = cfg.metadata_dir / f'{group_name}.json'
metadata_path = cfg.metadata_dir / f'{dataset_name}.json'
metadata = {} metadata = {}
if cfg.dataset.use_metadata and metadata_path.exists(): if cfg.dataset.use_metadata and metadata_path.exists():
@@ -107,10 +103,7 @@ def _load_paths_from_metadata(dataset_name, type="training", validate=False):
if len(metadata) == 0: if len(metadata) == 0:
return _fn( data_dir, type if cfg.dataset.use_hdf5 else _get_quant_extension(), validate ) return _fn( data_dir, type if cfg.dataset.use_hdf5 else _get_quant_extension(), validate )
def _validate( id, entry ):
def _validate( id ):
entry = metadata[id]
phones = entry['phones'] if "phones" in entry else 0 phones = entry['phones'] if "phones" in entry else 0
duration = entry['duration'] if "duration" in entry else 0 duration = entry['duration'] if "duration" in entry else 0
if type not in _total_durations: if type not in _total_durations:
@@ -118,14 +111,16 @@ def _load_paths_from_metadata(dataset_name, type="training", validate=False):
_total_durations[type] += duration _total_durations[type] += duration
"""
if cfg.dataset.use_hdf5: if cfg.dataset.use_hdf5:
k = key( id ) k = key( id )
if k not in cfg.hdf5 or "audio" not in cfg.hdf5[k] or "text" not in cfg.hdf5[k]: if k not in cfg.hdf5 or "audio" not in cfg.hdf5[k] or "text" not in cfg.hdf5[k]:
return False return False
"""
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration #and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
return [ key(id) for id in metadata.keys() if not validate or _validate(id) ] return [ key(id, entry) for id, entry in metadata.items() if not validate or _validate(id, entry) ]
def _get_hdf5_path(path): def _get_hdf5_path(path):
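With this refactor, _validate receives the metadata entry directly instead of re-indexing the dict, and only the duration bounds are actually enforced (the phone-count check is commented out). A standalone sketch of the filter over a per-group metadata file from cfg.metadata_dir; ids and numbers here are made up:

metadata = {
    "utt_0000": {"duration": 3.71, "phones": 52},   # kept
    "utt_0001": {"duration": 0.12, "phones": 4},    # dropped by the duration bounds
}

min_duration, max_duration = 1.0, 32.0  # stand-ins for cfg.dataset.min/max_duration
kept = [id for id, entry in metadata.items()
        if min_duration <= entry.get("duration", 0) <= max_duration]
print(kept)  # ['utt_0000']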
@@ -136,16 +131,16 @@ def _get_hdf5_path(path):
def _get_hdf5_paths( data_dir, type="training", validate=False ): def _get_hdf5_paths( data_dir, type="training", validate=False ):
data_dir = str(data_dir) data_dir = str(data_dir)
def _validate( child ): def _validate( id, entry ):
phones = child.attrs['phonemes'] phones = entry.attrs['phonemes']
duration = child.attrs['duration'] duration = entry.attrs['duration']
if type not in _total_durations: if type not in _total_durations:
_total_durations[type] = 0 _total_durations[type] = 0
_total_durations[type] += child.attrs['duration'] _total_durations[type] += entry.attrs['duration']
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration #and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
key = f"/{type}/{_get_hdf5_path(data_dir)}" key = f"/{type}/{_get_hdf5_path(data_dir)}"
return [ Path(f"{key}/{child.attrs['id']}") for child in cfg.hdf5[key].values() if not validate or _validate(child) ] if key in cfg.hdf5 else [] return [ Path(f"{key}/{id}") for id, entry in cfg.hdf5[key].items() if not validate or _validate(id, entry) ] if key in cfg.hdf5 else []
def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ): def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ):
if isinstance(path, str): if isinstance(path, str):
@@ -807,47 +802,30 @@ def create_dataset_metadata( skip_existing=True ):
if id not in metadata: if id not in metadata:
metadata[id] = {} metadata[id] = {}
# audio utterance_metadata = {}
if audios: if audios:
if _get_quant_extension() == ".dac": # ideally we'll encode Encodec-based audio in a similar manner because np has smaller files than pt
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()] dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16) qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
duration = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"] if "text" in dac["metadata"]:
metadata[id]["metadata"] = { utterance_metadata["text"] = dac["metadata"]["text"]
"original_length": dac["metadata"]["original_length"], if "phonemes" in dac["metadata"]:
"sample_rate": dac["metadata"]["sample_rate"], utterance_metadata["phonemes"] = dac["metadata"]["phonemes"]
} if "language" in dac["metadata"]:
else: utterance_metadata["language"] = dac["metadata"]["language"]
qnt = torch.load(f'{root}/{name}/{id}{_get_quant_extension()}')[0].t() if "original_length" in dac["metadata"] and "sample_rate" in dac["metadata"]:
duration = qnt.shape[0] / cfg.dataset.frames_per_second utterance_metadata["duration"] = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
metadata[id]["duration"] = duration
else:
metadata[id]["duration"] = 0
# text # text
if texts: if texts:
if _get_phone_extension() == ".json": if not utterance_metadata:
json_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read()) utterance_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
content = json_metadata["phonemes"]
txt = json_metadata["text"]
lang = json_metadata["language"][:2]
else:
content = open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read().split(" ")
txt = ""
lang = "en"
phn = cfg.tokenizer.encode("".join(content)) for k, v in utterance_metadata.items():
phn = np.array(phn).astype(np.uint8) metadata[id][k] = v
metadata[id]["phones"] = len(phn)
metadata[id]["transcription"] = txt
metadata[id]["language"] = lang
except Exception as e: except Exception as e:
#raise e tqdm.write(f'Error while processing {id}: {e}')
print(id, e)
#pass
with open(str(metadata_path), "w", encoding="utf-8") as f: with open(str(metadata_path), "w", encoding="utf-8") as f:
f.write( json.dumps( metadata ) ) f.write( json.dumps( metadata ) )
@@ -900,84 +878,68 @@ def create_dataset_hdf5( skip_existing=True ):
for id in tqdm(ids, desc=f"Processing {name}"): for id in tqdm(ids, desc=f"Processing {name}"):
try: try:
audio_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}') if audios else True audio_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}')
text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if texts else True text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if type != "Noise" else True
if not audio_exists or not text_exists: if not audio_exists:
continue continue
key = f'{type}/{speaker_name}/{id}' key = f'{type}/{speaker_name}/{id}'
"""
if skip_existing and key in hf: if skip_existing and key in hf:
continue continue
"""
group = hf.create_group(key) if key not in hf else hf[key] group = hf.create_group(key) if key not in hf else hf[key]
"""
group.attrs['id'] = id group.attrs['id'] = id
group.attrs['type'] = type group.attrs['type'] = type
group.attrs['speaker'] = speaker_name group.attrs['speaker'] = speaker_name
"""
if id not in metadata: if id not in metadata:
metadata[id] = {} metadata[id] = {}
utterance_metadata = {}
# audio # audio
if audios: if audios:
if _get_quant_extension() == ".dac": # ideally we'll encode Encodec-based audio in a similar manner because np has smaller files than pt
dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()] dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16) qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16)
duration = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"] if "text" in dac["metadata"]:
metadata[id]["metadata"] = { utterance_metadata["text"] = dac["metadata"]["text"]
"original_length": dac["metadata"]["original_length"], if "phonemes" in dac["metadata"]:
"sample_rate": dac["metadata"]["sample_rate"], utterance_metadata["phonemes"] = dac["metadata"]["phonemes"]
} if "language" in dac["metadata"]:
else: utterance_metadata["language"] = dac["metadata"]["language"]
qnt = torch.load(f'{root}/{name}/{id}{_get_quant_extension()}')[0].t() if "original_length" in dac["metadata"] and "sample_rate" in dac["metadata"]:
duration = qnt.shape[0] / cfg.dataset.frames_per_second utterance_metadata["duration"] = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"]
qnt = qnt.numpy().astype(np.int16)
if "audio" not in group: if "audio" not in group:
group.create_dataset('audio', data=qnt, compression='lzf') group.create_dataset('audio', data=qnt.numpy().astype(np.int16), compression='lzf')
group.attrs['duration'] = duration
metadata[id]["duration"] = duration
else:
group.attrs['duration'] = 0
metadata[id]["duration"] = 0
# text # text
if texts: if texts:
if _get_phone_extension() == ".json": if not utterance_metadata and text_exists:
json_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read()) utterance_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
content = json_metadata["phonemes"]
txt = json_metadata["text"]
lang = json_metadata["language"][:2]
else:
content = open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read().split(" ")
txt = ""
lang = "en"
phn = cfg.tokenizer.encode("".join(content)) phn = "".join(utterance_metadata["phonemes"])
phn = cfg.tokenizer.encode(phn)
phn = np.array(phn).astype(np.uint8) phn = np.array(phn).astype(np.uint8)
if "text" not in group: if "text" not in group:
group.create_dataset('text', data=phn, compression='lzf') group.create_dataset('text', data=phn, compression='lzf')
group.attrs['phonemes'] = len(phn) for k, v in utterance_metadata.items():
group.attrs['transcription'] = txt group.attrs[k] = v
group.attrs['language'] = lang metadata[id][k] = v
metadata[id]["phones"] = len(phn)
metadata[id]["transcription"] = txt
metadata[id]["language"] = lang
else:
group.attrs['phonemes'] = 0
metadata[id]["phones"] = 0
except Exception as e: except Exception as e:
#raise e tqdm.write(f'Error while processing {id}: {e}')
print(id, e)
#pass
with open(str(metadata_path), "w", encoding="utf-8") as f: with open(str(metadata_path), "w", encoding="utf-8") as f:
f.write( json.dumps( metadata ) ) f.write( json.dumps( metadata ) )
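Each utterance now lands under a {type}/{speaker_name}/{id} group holding an lzf-compressed int16 'audio' dataset, a uint8 'text' dataset of tokenizer ids, and the utterance metadata mirrored into the group's attrs. A reader sketch; the archive path and the ids are assumptions, since neither appears in this diff:

import h5py

with h5py.File("./training/dataset.h5", "r") as hf:  # actual path comes from the project config
    group = hf["training/some_speaker/utt_0000"]     # {type}/{speaker_name}/{id}
    codes = group["audio"][:]                        # int16 codebook indices
    tokens = group["text"][:]                        # uint8 phoneme-token ids
    print(group.attrs.get("duration"), group.attrs.get("language"))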
@@ -1002,119 +964,6 @@ def create_dataset_hdf5( skip_existing=True ):
hf.create_dataset('symmap', data=json.dumps(symmap)) hf.create_dataset('symmap', data=json.dumps(symmap))
hf.close() hf.close()
def extract_dataset_hdf5( skip_existing=True ):
cfg.dataset.use_hdf5 = True
cfg.load_hdf5(write=False)
hf = cfg.hdf5
symmap = get_phone_symmap()
reverse_symmap = {"1":"<s>","2":"</s>","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"","12":"","13":"d","14":"ɹ","15":"tˈ","16":"","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"","75":"kˈ","76":"ɔ","77":"zˈ","78":"","79":"","80":"vˈ","81":"","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"","86":"pˈ","87":"ðˌ","88":"","89":"","90":"","91":"̩","92":"ʔ","93":"","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"","101":"uːˈ","102":"iːˈ","103":"","104":".ˈ","105":"","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"","127":"ɫ","128":"q","129":"","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":"","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"","150":"","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"","180":"","181":"“ˈ","182":"“ˌ","183":";ˈ","184":"","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"","221":"eˈ","222":"ʍ","223":"","224":"","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
root = str(cfg.data_dir)
def add( type="training", audios=True, texts=True ):
for group in tqdm( hf[f'{type}/data/'].keys(), desc=f"Processing {type}"):
for name in tqdm( hf[f'{type}/data/{group}'].keys(), desc=f"Processing {group}"):
(cfg.data_dir / group / name).mkdir(parents=True, exist_ok=True)
for id in tqdm( hf[f'{type}/data/{group}/{name}'].keys(), desc=f"Processing {name}"):
try:
key = f'{type}/data/{group}/{name}/{id}'
if key not in hf:
tqdm.write(f'Missing key: {key}')
continue
audio_exists = "audio" in hf[key]
text_exists = "text" in hf[key]
if not audio_exists or not text_exists:
tqdm.write(f'Missing audio/text: {key}')
continue
audio_path = Path(f'{root}/{group}/{name}/{id}.enc')
text_path = Path(f'{root}/{group}/{name}/{id}.json')
# audio
if audios and audio_exists and not audio_path.exists():
qnt = hf[key]["audio"][:, :]
torch.save( qnt, audio_path )
# text
if texts and text_exists and not text_path.exists():
tokens = hf[key]["text"][:][1:-1]
phones = [ reverse_symmap[f'{token}'] for token in tokens ]
phones = list("".join(phones).replace(" ", " "))
j = {
"text": "",
"phonemes": phones,
"language": "en"
}
with open(text_path, "w", encoding="utf-8") as f:
f.write( json.dumps( j ) )
except Exception as e:
raise e
add( type="training" )
add( type="validation" )
add( type="noise", texts=False )
hf.close()
def retokenize_dataset_hdf5( skip_existing=True ):
cfg.dataset.use_hdf5 = True
cfg.load_hdf5(write=True)
hf = cfg.hdf5
symmap = get_phone_symmap()
reverse_symmap = {"1":"<s>","2":"</s>","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"","12":"","13":"d","14":"ɹ","15":"tˈ","16":"","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"","75":"kˈ","76":"ɔ","77":"zˈ","78":"","79":"","80":"vˈ","81":"","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"","86":"pˈ","87":"ðˌ","88":"","89":"","90":"","91":"̩","92":"ʔ","93":"","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"","101":"uːˈ","102":"iːˈ","103":"","104":".ˈ","105":"","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"","127":"ɫ","128":"q","129":"","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":"","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"","150":"","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"","180":"","181":"“ˈ","182":"“ˌ","183":";ˈ","184":"","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"","221":"eˈ","222":"ʍ","223":"","224":"","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
root = str(cfg.data_dir)
def add( type="training" ):
for group in tqdm( hf[f'{type}/data/'].keys(), desc=f"Processing {type}"):
for name in tqdm( hf[f'{type}/data/{group}'].keys(), desc=f"Processing {group}"):
(cfg.data_dir / group / name).mkdir(parents=True, exist_ok=True)
for id in tqdm( hf[f'{type}/data/{group}/{name}'].keys(), desc=f"Processing {name}"):
try:
key = f'{type}/data/{group}/{name}/{id}'
if key not in hf:
tqdm.write(f'Missing key: {key}')
continue
if "text" not in hf[key]:
tqdm.write(f'Missing text: {key}')
continue
# text
tokens = hf[key]["text"][:][1:-1]
content = list("".join([ reverse_symmap[f'{token}'] for token in tokens ]).replace(" ", " "))
tokens = cfg.tokenizer.encode("".join(content))
tokens = np.array(tokens).astype(np.uint8)
del hf[key]['text']
hf[key].create_dataset('text', data=tokens, compression='lzf')
except Exception as e:
raise e
add( type="training" )
add( type="validation" )
# write symmap
if "symmap" in hf:
del hf['symmap']
hf.create_dataset('symmap', data=json.dumps(symmap))
hf.close()
if __name__ == "__main__": if __name__ == "__main__":
import argparse import argparse
@@ -1135,10 +984,6 @@ if __name__ == "__main__":
if args.action == "hdf5": if args.action == "hdf5":
create_dataset_hdf5() create_dataset_hdf5()
if args.action == "extract-hdf5":
extract_dataset_hdf5()
if args.action == "retokenize-hdf5":
retokenize_dataset_hdf5()
elif args.action == "list-dataset": elif args.action == "list-dataset":
dataset = [] dataset = []
for group in os.listdir(cfg.data_dir): for group in os.listdir(cfg.data_dir):
@@ -1147,7 +992,7 @@ if __name__ == "__main__":
continue continue
dataset.append(f'{group}/{name}') dataset.append(f'{group}/{name}')
print(dataset) print(json.dumps(dataset))
elif args.action == "metadata": elif args.action == "metadata":
create_dataset_metadata() create_dataset_metadata()
elif args.action == "sample": elif args.action == "sample":