From d31f27119ab634261d97f111d14762b1ec8b9187 Mon Sep 17 00:00:00 2001 From: mrq Date: Sat, 21 Sep 2024 12:29:28 -0500 Subject: [PATCH] regex replace out the (lang) markers in espeak, updated tokenizer vocab as lazily as possible to not have unk tokens --- data/tokenizer.json | 13 ++++++++++++- vall_e/data.py | 6 ++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/data/tokenizer.json b/data/tokenizer.json index 66bd5e0..892771a 100644 --- a/data/tokenizer.json +++ b/data/tokenizer.json @@ -116,12 +116,16 @@ "": 3, " ": 4, - "ᵝ": 0, + "ᵝ": 4, "!": 5, "\"": 6, "(": 7, + "{": 4, + "[": 4, ")": 8, + "}": 8, + "]": 8, ",": 9, "-": 10, ".": 11, @@ -131,6 +135,7 @@ "?": 15, "a": 16, "ä": 16, + "ɒ": 16, "b": 17, "c": 18, "d": 19, @@ -145,6 +150,7 @@ "m": 27, "n": 28, "ɴ": 28, + "ɲ": 28, "o": 29, "̞": 29, "p": 30, @@ -152,9 +158,14 @@ "q": 31, "r": 32, "ɽ": 32, + "ʁ": 32, "s": 33, "t": 34, "u": 35, + "ø": 35, + "œ": 35, + "y": 35, + "ɣ": 35, "ũ": 35, "v": 36, "w": 37, diff --git a/vall_e/data.py b/vall_e/data.py index fb2b0f2..6bea50d 100755 --- a/vall_e/data.py +++ b/vall_e/data.py @@ -3,6 +3,7 @@ import copy import h5py import json +import re import logging import numpy as np import os @@ -1370,8 +1371,9 @@ def process_artifact_metadata( artifact ): # to-do: regex replace /([a-z]{2})/ to "" if "phonemes" in metadata: metadata["phonemes"] = metadata["phonemes"].replace("(en)", "") - if "phonemes" in metadata and "language" in metadata: - metadata["phonemes"] = metadata["phonemes"].replace(f"({metadata['language']})", "") + if "language" in metadata: + metadata["phonemes"] = metadata["phonemes"].replace(f"({metadata['language']})", "") + metadata["phonemes"] = re.sub(r'\([a-z]{2}\)', "", metadata["phonemes"]) return metadata