From 6a11bc9cb6be603f7407a18d7e0a5f28ef150ab7 Mon Sep 17 00:00:00 2001
From: mrq <mrq@ecker.tech>
Date: Mon, 29 Apr 2024 09:09:26 -0500
Subject: [PATCH] update tokenizer: the special tokens were in the wrong
 order, so eos resolved to the same id as unk

---
 data/tokenizer.json | 551 ++++++++++++++++++++++++++++++++++++++++++++
 vall_e/data.py      |  10 +-
 2 files changed, 558 insertions(+), 3 deletions(-)
 create mode 100644 data/tokenizer.json
diff --git a/data/tokenizer.json b/data/tokenizer.json
new file mode 100644
index 0000000..f034fc9
--- /dev/null
+++ b/data/tokenizer.json
@@ -0,0 +1,551 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "<bos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "</eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Whitespace"
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "<bos>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<eos>",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "<bos>": {
+        "id": "<bos>",
+        "ids": [
+          1
+        ],
+        "tokens": [
+          "<bos>"
+        ]
+      },
+      "<eos>": {
+        "id": "<eos>",
+        "ids": [
+          2
+        ],
+        "tokens": [
+          "</eos>"
+        ]
+      }
+    }
+  },
+  "decoder": null,
+  "model": {
+    "type": "BPE",
+    "dropout": null,
+    "unk_token": "<unk>",
+    "continuing_subword_prefix": null,
+    "end_of_word_suffix": null,
+    "fuse_unk": false,
+    "byte_fallback": false,
+    "vocab": {
+      "<unk>": 0,
+      "<bos>": 1,
+      "</eos>": 2,
+      "<mask>": 3,
+      "!": 4,
+      "\"": 5,
+      "(": 6,
+      ")": 7,
+      ",": 8,
+      "-": 9,
+      ".": 10,
+      "1": 11,
+      ":": 12,
+      "?": 13,
+      "a": 14,
+      "b": 15,
+      "c": 16,
+      "d": 17,
+      "e": 18,
+      "f": 19,
+      "h": 20,
+      "i": 21,
+      "j": 22,
+      "k": 23,
+      "l": 24,
+      "m": 25,
+      "n": 26,
+      "o": 27,
+      "p": 28,
+      "q": 29,
+      "r": 30,
+      "s": 31,
+      "t": 32,
+      "u": 33,
+      "v": 34,
+      "w": 35,
+      "x": 36,
+      "z": 37,
+      "¡": 38,
+      "«": 39,
+      "»": 40,
+      "¿": 41,
+      "æ": 42,
+      "ç": 43,
+      "ð": 44,
+      "ŋ": 45,
+      "ɐ": 46,
+      "ɑ": 47,
+      "ɔ": 48,
+      "ɕ": 49,
+      "ə": 50,
+      "ɚ": 51,
+      "ɛ": 52,
+      "ɜ": 53,
+      "ɟ": 54,
+      "ɡ": 55,
+      "ɪ": 56,
+      "ɬ": 57,
+      "ɯ": 58,
+      "ɹ": 59,
+      "ɾ": 60,
+      "ʃ": 61,
+      "ʈ": 62,
+      "ʊ": 63,
+      "ʋ": 64,
+      "ʌ": 65,
+      "ʑ": 66,
+      "ʒ": 67,
+      "ʔ": 68,
+      "ʲ": 69,
+      "ˈ": 70,
+      "ˌ": 71,
+      "ː": 72,
+      "̃": 73,
+      "̩": 74,
+      "θ": 75,
+      "ᵻ": 76,
+      "—": 77,
+      "…": 78,
+      "ˈɛ": 79,
+      "iː": 80,
+      "aɪ": 81,
+      "nd": 82,
+      "ˈɪ": 83,
+      "eɪ": 84,
+      "ˈæ": 85,
+      "oʊ": 86,
+      "ðə": 87,
+      "ɑː": 88,
+      "ˈeɪ": 89,
+      "ən": 90,
+      "uː": 91,
+      "ˈʌ": 92,
+      "ˈaɪ": 93,
+      "st": 94,
+      "ˈɔ": 95,
+      "ˈoʊ": 96,
+      "ˈiː": 97,
+      "ˈɑː": 98,
+      "ænd": 99,
+      "ːɹ": 100,
+      "ɪŋ": 101,
+      "ɜː": 102,
+      "ɪn": 103,
+      "tə": 104,
+      "ʌv": 105,
+      "aʊ": 106,
+      "əl": 107,
+      "ˈuː": 108,
+      "tʃ": 109,
+      "ɪz": 110,
+      "ˈɜː": 111,
+      "ˌʌ": 112,
+      "æt": 113,
+      "dʒ": 114,
+      "ˈɔː": 115,
+      "ɪt": 116,
+      "ˈaʊ": 117,
+      "ɚɹ": 118,
+      "ˈɛn": 119,
+      "wʌ": 120,
+      "li": 121,
+      "hiː": 122,
+      "ˌɛ": 123,
+      "wɪ": 124,
+      "ðæt": 125,
+      "wʌz": 126,
+      "juː": 127,
+      "oːɹ": 128,
+      "ðɪ": 129,
+      "sˈɛ": 130,
+      "ˈɑːɹ": 131,
+      "ˌɪ": 132,
+      "nt": 133,
+      "ˈʊ": 134,
+      "ənt": 135,
+      "hɪz": 136,
+      "hæ": 137,
+      "ˌɑː": 138,
+      "ɔːɹ": 139,
+      "ˈɛɹ": 140,
+      "wɪð": 141,
+      "ᵻd": 142,
+      "ˈoːɹ": 143,
+      "pɹ": 144,
+      "ˈɔːl": 145,
+      "mˌ": 146,
+      "ʃən": 147,
+      "kt": 148,
+      "ˌoʊ": 149,
+      "ˈɔːɹ": 150,
+      "fɹ": 151,
+      "æz": 152,
+      "ʃiː": 153,
+      "ˌʌt": 154,
+      "ˈɛl": 155,
+      "ˌaʊ": 156,
+      "ˈʌn": 157,
+      "əs": 158,
+      "hɜː": 159,
+      "lˈaɪ": 160,
+      "ˈæn": 161,
+      "ˈɪɹ": 162,
+      "ʊd": 163,
+      "ɹᵻ": 164,
+      "ld": 165,
+      "bˌʌt": 166,
+      "ks": 167,
+      "nˈoʊ": 168,
+      "ɾɚ": 169,
+      "hæd": 170,
+      "ɛɹ": 171,
+      "ˈɪŋ": 172,
+      "ɡɹ": 173,
+      "ɔn": 174,
+      "nˌɑː": 175,
+      "maɪ": 176,
+      "vɚ": 177,
+      "fɔːɹ": 178,
+      "ðɚ": 179,
+      "tʊ": 180,
+      "ðɛɹ": 181,
+      "ˈʌm": 182,
+      "nˌɑːt": 183,
+      "tɹ": 184,
+      "sˈiː": 185,
+      "ʌvðə": 186,
+      "mˈɪ": 187,
+      "ˈæp": 188,
+      "ˌɪm": 189,
+      "ɪk": 190,
+      "sp": 191,
+      "lˈeɪ": 192,
+      "hˌɪm": 193,
+      "ɐn": 194,
+      "ðeɪ": 195,
+      "lˈɪ": 196,
+      "ɾi": 197,
+      "bɹ": 198,
+      "lˈɛ": 199,
+      "kɹ": 200,
+      "ˈɪl": 201,
+      "jˈuː": 202,
+      "ʌm": 203,
+      "mˌiː": 204,
+      "lˈæ": 205,
+      "ˌɪn": 206,
+      "bᵻ": 207,
+      "wˈʌn": 208,
+      "ˈɪn": 209,
+      "ˈoʊn": 210,
+      "biː": 211,
+      "sˈɛd": 212,
+      "ˈɛd": 213,
+      "ˈaɪt": 214,
+      "fɹʌm": 215,
+      "baɪ": 216,
+      "ɪs": 217,
+      "ɚz": 218,
+      "ðɪs": 219,
+      "əns": 220,
+      "ɪf": 221,
+      "bəl": 222,
+      "ˈænd": 223,
+      "ɪnðə": 224,
+      "əm": 225,
+      "iːz": 226,
+      "ˌuː": 227,
+      "ᵻz": 228,
+      "wˈeɪ": 229,
+      "ft": 230,
+      "wiː": 231,
+      "lˈiː": 232,
+      "stɹ": 233,
+      "jʊ": 234,
+      "ɚd": 235,
+      "ˌaɪ": 236,
+      "kw": 237,
+      "ˌɔn": 238,
+      "ˈaɪd": 239,
+      "ts": 240,
+      "ɪm": 241,
+      "ˈʌst": 242,
+      "ˈoʊld": 243,
+      "ˌɪtʃ": 244,
+      "dˈɪ": 245,
+      "sˌoʊ": 246,
+      "ɑːɹ": 247,
+      "hɐ": 248,
+      "sˈeɪ": 249,
+      "ɾᵻd": 250,
+      "dᵻ": 251,
+      "wˌɪtʃ": 252,
+      "sˈɛl": 253,
+      "ɹi": 254,
+      "ˈʌðɚ": 255
+    },
+    "merges": [
+      "ˈ ɛ",
+      "i ː",
+      "a ɪ",
+      "n d",
+      "ˈ ɪ",
+      "e ɪ",
+      "ˈ æ",
+      "o ʊ",
+      "ð ə",
+      "ɑ ː",
+      "ˈ eɪ",
+      "ə n",
+      "u ː",
+      "ˈ ʌ",
+      "ˈ aɪ",
+      "s t",
+      "ˈ ɔ",
+      "ˈ oʊ",
+      "ˈ iː",
+      "ˈ ɑː",
+      "æ nd",
+      "ː ɹ",
+      "ɪ ŋ",
+      "ɜ ː",
+      "ɪ n",
+      "t ə",
+      "ʌ v",
+      "a ʊ",
+      "ə l",
+      "ˈ uː",
+      "t ʃ",
+      "ɪ z",
+      "ˈ ɜː",
+      "ˌ ʌ",
+      "æ t",
+      "d ʒ",
+      "ˈɔ ː",
+      "ɪ t",
+      "ˈ aʊ",
+      "ɚ ɹ",
+      "ˈɛ n",
+      "w ʌ",
+      "l i",
+      "h iː",
+      "ˌ ɛ",
+      "w ɪ",
+      "ð æt",
+      "wʌ z",
+      "j uː",
+      "o ːɹ",
+      "ð ɪ",
+      "s ˈɛ",
+      "ˈɑː ɹ",
+      "ˌ ɪ",
+      "n t",
+      "ˈ ʊ",
+      "ən t",
+      "h ɪz",
+      "h æ",
+      "ˌ ɑː",
+      "ɔ ːɹ",
+      "ˈɛ ɹ",
+      "wɪ ð",
+      "ᵻ d",
+      "ˈ oːɹ",
+      "p ɹ",
+      "ˈɔː l",
+      "m ˌ",
+      "ʃ ən",
+      "k t",
+      "ˌ oʊ",
+      "ˈɔ ːɹ",
+      "f ɹ",
+      "æ z",
+      "ʃ iː",
+      "ˌʌ t",
+      "ˈɛ l",
+      "ˌ aʊ",
+      "ˈʌ n",
+      "ə s",
+      "h ɜː",
+      "l ˈaɪ",
+      "ˈæ n",
+      "ˈɪ ɹ",
+      "ʊ d",
+      "ɹ ᵻ",
+      "l d",
+      "b ˌʌt",
+      "k s",
+      "n ˈoʊ",
+      "ɾ ɚ",
+      "hæ d",
+      "ɛ ɹ",
+      "ˈɪ ŋ",
+      "ɡ ɹ",
+      "ɔ n",
+      "n ˌɑː",
+      "m aɪ",
+      "v ɚ",
+      "f ɔːɹ",
+      "ð ɚ",
+      "t ʊ",
+      "ð ɛɹ",
+      "ˈʌ m",
+      "nˌɑː t",
+      "t ɹ",
+      "s ˈiː",
+      "ʌv ðə",
+      "m ˈɪ",
+      "ˈæ p",
+      "ˌɪ m",
+      "ɪ k",
+      "s p",
+      "l ˈeɪ",
+      "h ˌɪm",
+      "ɐ n",
+      "ð eɪ",
+      "l ˈɪ",
+      "ɾ i",
+      "b ɹ",
+      "l ˈɛ",
+      "k ɹ",
+      "ˈɪ l",
+      "j ˈuː",
+      "ʌ m",
+      "mˌ iː",
+      "l ˈæ",
+      "ˌ ɪn",
+      "b ᵻ",
+      "w ˈʌn",
+      "ˈɪ n",
+      "ˈoʊ n",
+      "b iː",
+      "sˈɛ d",
+      "ˈɛ d",
+      "ˈaɪ t",
+      "fɹ ʌm",
+      "b aɪ",
+      "ɪ s",
+      "ɚ z",
+      "ðɪ s",
+      "ən s",
+      "ɪ f",
+      "b əl",
+      "ˈæ nd",
+      "ɪn ðə",
+      "ə m",
+      "iː z",
+      "ˌ uː",
+      "ᵻ z",
+      "w ˈeɪ",
+      "f t",
+      "w iː",
+      "l ˈiː",
+      "st ɹ",
+      "j ʊ",
+      "ɚ d",
+      "ˌ aɪ",
+      "k w",
+      "ˌ ɔn",
+      "ˈaɪ d",
+      "t s",
+      "ɪ m",
+      "ˈʌ st",
+      "ˈoʊ ld",
+      "ˌɪ tʃ",
+      "d ˈɪ",
+      "s ˌoʊ",
+      "ɑː ɹ",
+      "h ɐ",
+      "s ˈeɪ",
+      "ɾ ᵻd",
+      "d ᵻ",
+      "w ˌɪtʃ",
+      "sˈɛ l",
+      "ɹ i",
+      "ˈʌ ðɚ"
+    ]
+  }
+}
\ No newline at end of file
diff --git a/vall_e/data.py b/vall_e/data.py
index ca2a923..40d54fc 100755
--- a/vall_e/data.py
+++ b/vall_e/data.py
@@ -753,7 +753,7 @@ def create_train_val_dataloader():
 	return train_dl, subtrain_dl, val_dl
 
 # parse dataset into better to sample metadata
-def create_dataset_metadata( skip_existing=False ):
+def create_dataset_metadata( skip_existing=True ):
 	symmap = get_phone_symmap()
 	
 	root = str(cfg.data_dir)
@@ -828,7 +828,8 @@ def create_dataset_metadata( skip_existing=False ):
 					metadata[id]["phones"] = len(phn)
 					metadata[id]["transcription"] = txt
 			except Exception as e:
-				raise e
+				#raise e
+				print(id, e)
 				#pass
 
 		with open(str(metadata_path), "w", encoding="utf-8") as f:
@@ -885,8 +886,10 @@ def create_dataset_hdf5( skip_existing=True ):
 
 				key = f'{type}/{name}/{id}'
 
+				"""
 				if skip_existing and key in hf:
 					continue
+				"""
 
 				group = hf.create_group(key) if key not in hf else hf[key]
 
@@ -948,7 +951,8 @@ def create_dataset_hdf5( skip_existing=True ):
 					group.attrs['phonemes'] = 0
 					metadata[id]["phones"] = 0
 			except Exception as e:
-				raise e
+				#raise e
+				print(id, e)
 				#pass
 
 		with open(str(metadata_path), "w", encoding="utf-8") as f: