From 1a8c5de5172df2faaa7b134f6240521cc574e381 Mon Sep 17 00:00:00 2001
From: mrq
Date: Thu, 16 Mar 2023 14:59:12 +0000
Subject: [PATCH] unk hunting

---
 models/tokenizers/ipa.json | 21 +++++++++++++++++++--
 src/utils.py               |  2 +-
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/models/tokenizers/ipa.json b/models/tokenizers/ipa.json
index 498c860..598b373 100755
--- a/models/tokenizers/ipa.json
+++ b/models/tokenizers/ipa.json
@@ -106,7 +106,18 @@
       "ʌ": 57,
       "ʒ": 58,
       "θ": 59,
-      "ː": 60
+      "ɐ": 60,
+      "ɜ": 61,
+      "ᵻ": 62,
+      "ɾ": 63,
+      "n̩": 64,
+      "ː": 65,
+      "ɔː": 66,
+      "uː": 67,
+      "iː": 68,
+      "ɑː": 69,
+      "oː": 70,
+      "ɜː": 71
     },
 
     "merges": [
@@ -116,7 +127,13 @@
       "e ɪ",
       "o ʊ",
       "t͡ ʃ",
-      "ɔ ɪ"
+      "ɔ ɪ",
+      "ɔ ː",
+      "u ː",
+      "i ː",
+      "ɑ ː",
+      "o ː",
+      "ɜ ː"
     ]
   }
 }
\ No newline at end of file
diff --git a/src/utils.py b/src/utils.py
index 789b4a3..03b5252 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -1781,7 +1781,7 @@ def tokenize_text( text ):
     load_tts()
 
     encoded = tts.tokenizer.encode(text)
-    decoded = tts.tokenizer.tokenizer.decode(encoded, skip_special_tokens=False).replace(" ", "")
+    decoded = tts.tokenizer.tokenizer.decode(encoded, skip_special_tokens=False).replace(" ", "").replace("[SPACE]", " ")
 
     return "\n".join([ str(encoded), decoded ])
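
The vocab and merge additions above exist so that long vowels such as "ɔː" stop encoding to [UNK] (hence the subject line "unk hunting"): each new merge rule pairs a base vowel with the length mark "ː", and the matching vocab entries give the merged symbols their own ids (66-71). A minimal verification sketch follows, assuming the JSON is a Hugging Face tokenizers file that Tokenizer.from_file can load; the path is taken from the diffstat, and the pre-patch [UNK] fallback is inferred from the commit subject rather than stated in the diff.

from tokenizers import Tokenizer

# Load the patched tokenizer definition (path taken from the diff above).
tokenizer = Tokenizer.from_file("models/tokenizers/ipa.json")

# Each long vowel should now resolve to a single dedicated token (ids 66-71)
# instead of falling back to [UNK] as it presumably did before this patch.
for symbol in ["ɔː", "uː", "iː", "ɑː", "oː", "ɜː"]:
    encoding = tokenizer.encode(symbol)
    print(symbol, encoding.ids, encoding.tokens)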
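
In src/utils.py, decode(..., skip_special_tokens=False) returns the tokens joined by spaces with the [SPACE] special token kept literal; the first .replace strips those joining spaces, and the newly added second .replace turns each [SPACE] marker back into a real space so word boundaries survive in the debug output. A hedged illustration of the two steps (the raw decoded string below is invented for the example; only the pair of .replace calls mirrors the patch):

# What decode() might return for a two-word input, with inter-token
# spaces and the literal [SPACE] special still in place:
raw = "h ɛ l oʊ [SPACE] w ɜː l d"

stripped = raw.replace(" ", "")              # old behavior: "hɛloʊ[SPACE]wɜːld"
readable = stripped.replace("[SPACE]", " ")  # patched: "hɛloʊ wɜːld"
print(readable)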