forked from mrq/ai-voice-cloning
cleaned up some prepare dataset code
This commit is contained in:
parent
0b62ccc112
commit
d4c50967a6
|
@ -2,8 +2,11 @@
|
||||||
"version": "1.0",
|
"version": "1.0",
|
||||||
"truncation": null,
|
"truncation": null,
|
||||||
"padding": null,
|
"padding": null,
|
||||||
"added_tokens":
|
"normalizer": null,
|
||||||
[
|
"pre_tokenizer": null,
|
||||||
|
"post_processor": null,
|
||||||
|
"decoder": null,
|
||||||
|
"added_tokens": [
|
||||||
{
|
{
|
||||||
"id": 0,
|
"id": 0,
|
||||||
"special": true,
|
"special": true,
|
||||||
|
@ -32,20 +35,14 @@
|
||||||
"normalized": false
|
"normalized": false
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"normalizer": null,
|
"model": {
|
||||||
"pre_tokenizer": null,
|
|
||||||
"post_processor": null,
|
|
||||||
"decoder": null,
|
|
||||||
"model":
|
|
||||||
{
|
|
||||||
"type": "BPE",
|
"type": "BPE",
|
||||||
"dropout": null,
|
"dropout": null,
|
||||||
"unk_token": "[UNK]",
|
"unk_token": "[UNK]",
|
||||||
"continuing_subword_prefix": null,
|
"continuing_subword_prefix": null,
|
||||||
"end_of_word_suffix": null,
|
"end_of_word_suffix": null,
|
||||||
"fuse_unk": false,
|
"fuse_unk": false,
|
||||||
"vocab":
|
"vocab": {
|
||||||
{
|
|
||||||
"[STOP]": 0,
|
"[STOP]": 0,
|
||||||
"[UNK]": 1,
|
"[UNK]": 1,
|
||||||
"[SPACE]": 2,
|
"[SPACE]": 2,
|
||||||
|
@ -61,40 +58,39 @@
|
||||||
";": 12,
|
";": 12,
|
||||||
"?": 13,
|
"?": 13,
|
||||||
"a": 14,
|
"a": 14,
|
||||||
"aɪ": 15,
|
"b": 15,
|
||||||
"aʊ": 16,
|
"c": 16,
|
||||||
"b": 17,
|
"d": 17,
|
||||||
"d": 18,
|
"e": 18,
|
||||||
"d͡": 19,
|
"f": 19,
|
||||||
"d͡ʒ": 20,
|
"g": 20,
|
||||||
"e": 21,
|
"h": 21,
|
||||||
"eɪ": 22,
|
"i": 22,
|
||||||
"f": 23,
|
"j": 23,
|
||||||
"h": 24,
|
"k": 24,
|
||||||
"i": 25,
|
"l": 25,
|
||||||
"j": 26,
|
"m": 26,
|
||||||
"k": 27,
|
"n": 27,
|
||||||
"l": 28,
|
"o": 28,
|
||||||
"m": 29,
|
"p": 29,
|
||||||
"n": 30,
|
"q": 30,
|
||||||
"o": 31,
|
"r": 31,
|
||||||
"oʊ": 32,
|
"s": 32,
|
||||||
"p": 33,
|
"t": 33,
|
||||||
"s": 34,
|
"u": 34,
|
||||||
"t": 35,
|
"v": 35,
|
||||||
"t͡": 36,
|
"w": 36,
|
||||||
"t͡ʃ": 37,
|
"x": 37,
|
||||||
"u": 38,
|
"y": 38,
|
||||||
"v": 39,
|
"z": 39,
|
||||||
"w": 40,
|
"d͡": 41,
|
||||||
"z": 41,
|
"t͡": 42,
|
||||||
"|": 42,
|
"|": 43,
|
||||||
"æ": 43,
|
"æ": 44,
|
||||||
"ð": 44,
|
"ð": 45,
|
||||||
"ŋ": 45,
|
"ŋ": 46,
|
||||||
"ɑ": 46,
|
"ɑ": 47,
|
||||||
"ɔ": 47,
|
"ɔ": 48,
|
||||||
"ɔɪ": 48,
|
|
||||||
"ə": 49,
|
"ə": 49,
|
||||||
"ɚ": 50,
|
"ɚ": 50,
|
||||||
"ɛ": 51,
|
"ɛ": 51,
|
||||||
|
@ -112,27 +108,34 @@
|
||||||
"ɾ": 63,
|
"ɾ": 63,
|
||||||
"n̩": 64,
|
"n̩": 64,
|
||||||
"ː": 65,
|
"ː": 65,
|
||||||
"ɔː": 66,
|
"ˈ": 66,
|
||||||
"uː": 67,
|
"d͡ʒ": 67,
|
||||||
"iː": 68,
|
"aɪ": 68,
|
||||||
"ɑː": 69,
|
"aʊ": 69,
|
||||||
"oː": 70,
|
"eɪ": 70,
|
||||||
"ɜː": 71
|
"oʊ": 71,
|
||||||
|
"t͡ʃ": 72,
|
||||||
|
"ɔɪ": 73,
|
||||||
|
"ɔː": 74,
|
||||||
|
"uː": 75,
|
||||||
|
"iː": 76,
|
||||||
|
"ɑː": 77,
|
||||||
|
"oː": 78,
|
||||||
|
"ɜː": 79
|
||||||
},
|
},
|
||||||
"merges":
|
"merges": [
|
||||||
[
|
|
||||||
"a ɪ",
|
"a ɪ",
|
||||||
"a ʊ",
|
|
||||||
"d͡ ʒ",
|
|
||||||
"e ɪ",
|
"e ɪ",
|
||||||
"o ʊ",
|
|
||||||
"t͡ ʃ",
|
|
||||||
"ɔ ɪ",
|
"ɔ ɪ",
|
||||||
"ɔ ː",
|
"a ʊ",
|
||||||
"u ː",
|
"o ʊ",
|
||||||
|
"d͡ ʒ",
|
||||||
|
"t͡ ʃ",
|
||||||
"i ː",
|
"i ː",
|
||||||
"ɑ ː",
|
|
||||||
"o ː",
|
"o ː",
|
||||||
|
"u ː",
|
||||||
|
"ɑ ː",
|
||||||
|
"ɔ ː",
|
||||||
"ɜ ː"
|
"ɜ ː"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
219
src/utils.py
219
src/utils.py
|
@ -1187,7 +1187,8 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non
|
||||||
# resample to the input rate, since it'll get resampled for training anyways
|
# resample to the input rate, since it'll get resampled for training anyways
|
||||||
# this should also "help" increase throughput a bit when filling the dataloaders
|
# this should also "help" increase throughput a bit when filling the dataloaders
|
||||||
waveform, sample_rate = resample(waveform, sample_rate, tts.input_sample_rate if tts is not None else 22050)
|
waveform, sample_rate = resample(waveform, sample_rate, tts.input_sample_rate if tts is not None else 22050)
|
||||||
|
if waveform.shape[0] == 2:
|
||||||
|
waveform = waveform[:1]
|
||||||
torchaudio.save(f"{indir}/audio/{basename}", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
|
torchaudio.save(f"{indir}/audio/{basename}", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
|
||||||
|
|
||||||
with open(infile, 'w', encoding="utf-8") as f:
|
with open(infile, 'w', encoding="utf-8") as f:
|
||||||
|
@ -1254,6 +1255,10 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
|
||||||
messages.append(message)
|
messages.append(message)
|
||||||
continue
|
continue
|
||||||
sliced, _ = resample( sliced, sample_rate, 22050 )
|
sliced, _ = resample( sliced, sample_rate, 22050 )
|
||||||
|
|
||||||
|
if waveform.shape[0] == 2:
|
||||||
|
waveform = waveform[:1]
|
||||||
|
|
||||||
torchaudio.save(f"{indir}/audio/{file}", sliced, 22050, encoding="PCM_S", bits_per_sample=16)
|
torchaudio.save(f"{indir}/audio/{file}", sliced, 22050, encoding="PCM_S", bits_per_sample=16)
|
||||||
|
|
||||||
segments +=1
|
segments +=1
|
||||||
|
@ -1261,15 +1266,8 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
|
||||||
messages.append(f"Sliced segments: {files} => {segments}.")
|
messages.append(f"Sliced segments: {files} => {segments}.")
|
||||||
return "\n".join(messages)
|
return "\n".join(messages)
|
||||||
|
|
||||||
"""
|
# takes an LJSpeech-dataset-formatted .txt file and phonemize it
|
||||||
def phonemizer( text, language="eng" ):
|
def phonemize_txt_file( path ):
|
||||||
transducer = make_g2p(language, f'{language}-ipa')
|
|
||||||
phones = transducer(text).output_string
|
|
||||||
ignored = [" "] + [ p for p in string.punctuation ]
|
|
||||||
return ["_" if p in ignored else p for p in phones]
|
|
||||||
"""
|
|
||||||
|
|
||||||
def phonemize_txt( path ):
|
|
||||||
with open(path, 'r', encoding='utf-8') as f:
|
with open(path, 'r', encoding='utf-8') as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
|
|
||||||
|
@ -1291,39 +1289,62 @@ def phonemize_txt( path ):
|
||||||
|
|
||||||
return joined
|
return joined
|
||||||
|
|
||||||
|
# takes an LJSpeech-dataset-formatted .txt (and phonemized .phn.txt from the above) and creates a JSON that should slot in as whisper.json
|
||||||
|
def _read_pipe_entries( path ):
    """Read an LJSpeech-style file and return a list of (audio, text) pairs.

    Each useful line looks like ``audio|text[|...]``; only the first two
    fields are used. Lines without a ``|`` separator (blank lines, stray
    trailing newlines) are skipped instead of raising IndexError.
    """
    entries = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split("|")
            if len(fields) < 2:
                continue
            entries.append((fields[0], fields[1].strip()))
    return entries


def create_dataset_json( path ):
    """Convert an LJSpeech-formatted .txt into a JSON keyed by audio path.

    Reads ``path`` (lines of ``audio|text``) and, when it exists, the sibling
    ``<name>.phn.txt`` file (lines of ``audio|phonemes``). Writes
    ``<name>.json`` next to the input, mapping each audio path to
    ``{'text': ...}`` plus ``'phonemes'`` when a phonemized line is available.

    Args:
        path: path to the LJSpeech-formatted text file (normally ``*.txt``).

    Returns:
        None. The result is written to disk.
    """
    # Derive sibling paths from the extension only: str.replace(".txt", ...)
    # would also rewrite ".txt" inside directory names, and would map a path
    # without ".txt" onto itself (overwriting the input with the JSON).
    base, _ = os.path.splitext(path)
    phn_path = f"{base}.phn.txt"
    json_path = f"{base}.json"

    data = {}
    for audio, text in _read_pipe_entries(path):
        data[audio] = {
            'text': text
        }

    # The phonemized file is optional; without it only 'text' is emitted.
    if os.path.exists(phn_path):
        for audio, phonemes in _read_pipe_entries(phn_path):
            if audio not in data:
                # A phoneme line with no transcription has nothing to attach to.
                print(f"Phonemes without matching text entry, skipping... {audio}")
                continue
            data[audio]['phonemes'] = phonemes

    with open(json_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(data, indent="\t"))
|
||||||
|
|
||||||
def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, progress=gr.Progress() ):
|
def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, progress=gr.Progress() ):
|
||||||
indir = f'./training/{voice}/'
|
indir = f'./training/{voice}/'
|
||||||
infile = f'{indir}/whisper.json'
|
infile = f'{indir}/whisper.json'
|
||||||
messages = []
|
|
||||||
normalize = True
|
|
||||||
phonemize = args.tokenizer_json is not None and args.tokenizer_json[-8:] == "ipa.json"
|
|
||||||
if args.tts_backend == "vall-e":
|
|
||||||
phonemize = True
|
|
||||||
|
|
||||||
if not os.path.exists(infile):
|
if not os.path.exists(infile):
|
||||||
raise Exception(f"Missing dataset: {infile}")
|
raise Exception(f"Missing dataset: {infile}")
|
||||||
|
|
||||||
results = json.load(open(infile, 'r', encoding="utf-8"))
|
results = json.load(open(infile, 'r', encoding="utf-8"))
|
||||||
|
|
||||||
lines = {
|
|
||||||
'training': [],
|
|
||||||
'validation': []
|
|
||||||
}
|
|
||||||
|
|
||||||
already_segmented = []
|
|
||||||
|
|
||||||
errored = 0
|
errored = 0
|
||||||
|
messages = []
|
||||||
|
normalize = True
|
||||||
|
phonemize = args.tokenizer_json is not None and args.tokenizer_json[-8:] == "ipa.json"
|
||||||
|
lines = { 'training': [], 'validation': [] }
|
||||||
|
segments = {}
|
||||||
|
|
||||||
|
if args.tts_backend == "vall-e":
|
||||||
|
phonemize = True
|
||||||
|
|
||||||
for filename in results:
|
for filename in results:
|
||||||
use_segment = use_segments
|
use_segment = use_segments
|
||||||
|
|
||||||
result = results[filename]
|
result = results[filename]
|
||||||
language = LANGUAGES[result['language']] if result['language'] in LANGUAGES else None
|
language = LANGUAGES[result['language']] if result['language'] in LANGUAGES else None
|
||||||
if language == "english":
|
normalizer = EnglishTextNormalizer() if language and language == "english" else BasicTextNormalizer()
|
||||||
language = "en-us"
|
|
||||||
|
|
||||||
normalizer = None
|
|
||||||
if normalize:
|
|
||||||
normalizer = EnglishTextNormalizer() if language and language == "english" else BasicTextNormalizer()
|
|
||||||
|
|
||||||
# check if unsegmented text exceeds 200 characters
|
# check if unsegmented text exceeds 200 characters
|
||||||
if not use_segment:
|
if not use_segment:
|
||||||
|
@ -1349,84 +1370,84 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
messages.append(message)
|
messages.append(message)
|
||||||
use_segment = True
|
use_segment = True
|
||||||
|
|
||||||
segments = result['segments'] if use_segment else [{'text': result['text']}]
|
# implicitly segment
|
||||||
|
if use_segment and not use_segments:
|
||||||
|
tmp = {}
|
||||||
|
tmp[filename] = result
|
||||||
|
print(f"Audio not segmented, segmenting: {filename}")
|
||||||
|
message = slice_dataset( voice, results=tmp )
|
||||||
|
print(message)
|
||||||
|
messages = messages + message.split("\n")
|
||||||
|
|
||||||
for segment in enumerate_progress(segments, desc="Parsing segments", progress=progress):
|
if not use_segment:
|
||||||
file = filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav") if use_segment else filename
|
segments[filename] = {
|
||||||
path = f'{indir}/audio/{file}'
|
'text': result['text'],
|
||||||
# segment when needed
|
'language': language,
|
||||||
if not os.path.exists(path) and filename not in already_segmented:
|
'normalizer': normalizer,
|
||||||
already_segmented.append(filename)
|
'phonemes': result['phonemes'] if 'phonemes' in result else None
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
for segment in result['segments']:
|
||||||
|
segments[filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav")] = {
|
||||||
|
'text': segment['text'],
|
||||||
|
'language': language,
|
||||||
|
'normalizer': normalizer,
|
||||||
|
'phonemes': segment['phonemes'] if 'phonemes' in segment else None
|
||||||
|
}
|
||||||
|
|
||||||
tmp_results = {}
|
for file in enumerate_progress(segments, desc="Parsing segments", progress=progress):
|
||||||
tmp_results[filename] = result
|
result = segments[file]
|
||||||
print(f"Audio not segmented, segmenting: {filename}")
|
path = f'{indir}/audio/{file}'
|
||||||
message = slice_dataset( voice, results=tmp_results )
|
|
||||||
print(message)
|
|
||||||
messages = messages + message.split("\n")
|
|
||||||
|
|
||||||
if not os.path.exists(path):
|
text = result['text']
|
||||||
message = f"Missing source audio: {file}"
|
language = result['language']
|
||||||
print(message)
|
normalizer = result['normalizer']
|
||||||
messages.append(message)
|
phonemes = result['phonemes']
|
||||||
errored += 1
|
if phonemize and phonemes is None:
|
||||||
continue
|
phonemes = phonemizer( text, language=language if language != "english" else "en-us", strip=True, preserve_punctuation=True, with_stress=True, backend=args.phonemizer_backend )
|
||||||
|
if phonemize:
|
||||||
|
text = phonemes
|
||||||
|
|
||||||
text = segment['text'].strip()
|
if len(text) > 200:
|
||||||
normalized_text = normalizer(text) if normalize else None
|
message = f"Text length too long (200 < {len(text)}), skipping... {file}"
|
||||||
try:
|
print(message)
|
||||||
phonemes = phonemizer( text, language=language, preserve_punctuation=True, strip=True ) if phonemize else None
|
messages.append(message)
|
||||||
if phonemize:
|
errored += 1
|
||||||
text = phonemes
|
continue
|
||||||
except Exception as e:
|
|
||||||
print(e)
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
waveform, sample_rate = torchaudio.load(path)
|
||||||
|
num_channels, num_frames = waveform.shape
|
||||||
|
duration = num_frames / sample_rate
|
||||||
|
|
||||||
if len(text) > 200:
|
error = validate_waveform( waveform, sample_rate )
|
||||||
message = f"Text length too long (200 < {len(text)}), skipping... {file}"
|
if error:
|
||||||
print(message)
|
message = f"{error}, skipping... {file}"
|
||||||
messages.append(message)
|
print(message)
|
||||||
errored += 1
|
messages.append(message)
|
||||||
continue
|
errored += 1
|
||||||
|
continue
|
||||||
|
|
||||||
waveform, sample_rate = torchaudio.load(path)
|
culled = len(text) < text_length
|
||||||
num_channels, num_frames = waveform.shape
|
if not culled and audio_length > 0:
|
||||||
duration = num_frames / sample_rate
|
culled = duration < audio_length
|
||||||
|
|
||||||
error = validate_waveform( waveform, sample_rate )
|
line = f'audio/{file}|{text}'
|
||||||
if error:
|
|
||||||
message = f"{error}, skipping... {file}"
|
|
||||||
print(message)
|
|
||||||
messages.append(message)
|
|
||||||
errored += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
lines['training' if not culled else 'validation'].append(line)
|
||||||
|
|
||||||
culled = len(text) < text_length
|
if culled or args.tts_backend != "vall-e":
|
||||||
if not culled and audio_length > 0:
|
continue
|
||||||
culled = duration < audio_length
|
|
||||||
|
|
||||||
line = f'audio/{file}|{text}'
|
os.makedirs(f'{indir}/valle/', exist_ok=True)
|
||||||
|
|
||||||
lines['training' if not culled else 'validation'].append(line)
|
from vall_e.emb.qnt import encode as quantize
|
||||||
|
# from vall_e.emb.g2p import encode as phonemize
|
||||||
|
|
||||||
if culled or args.tts_backend != "vall-e":
|
quantized = quantize( waveform, sample_rate ).cpu()
|
||||||
continue
|
print("Quantized:", file)
|
||||||
|
|
||||||
os.makedirs(f'{indir}/valle/', exist_ok=True)
|
torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')
|
||||||
|
open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(text)
|
||||||
from vall_e.emb.qnt import encode as quantize
|
|
||||||
# from vall_e.emb.g2p import encode as phonemize
|
|
||||||
|
|
||||||
if waveform.shape[0] == 2:
|
|
||||||
waveform = waveform[:1]
|
|
||||||
|
|
||||||
quantized = quantize( waveform, sample_rate ).cpu()
|
|
||||||
torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')
|
|
||||||
|
|
||||||
# phonemes = phonemizer(normalized_text)
|
|
||||||
open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(text)
|
|
||||||
|
|
||||||
training_joined = "\n".join(lines['training'])
|
training_joined = "\n".join(lines['training'])
|
||||||
validation_joined = "\n".join(lines['validation'])
|
validation_joined = "\n".join(lines['validation'])
|
||||||
|
@ -1803,9 +1824,9 @@ def tokenize_text( text ):
|
||||||
load_tts()
|
load_tts()
|
||||||
|
|
||||||
encoded = tts.tokenizer.encode(text)
|
encoded = tts.tokenizer.encode(text)
|
||||||
decoded = tts.tokenizer.tokenizer.decode(encoded, skip_special_tokens=False).replace(" ", "").replace("[SPACE]", " ")
|
decoded = tts.tokenizer.tokenizer.decode(encoded, skip_special_tokens=False).split(" ")
|
||||||
|
|
||||||
return "\n".join([ str(encoded), decoded ])
|
return "\n".join([ str(encoded), str(decoded) ])
|
||||||
|
|
||||||
def get_dataset_list(dir="./training/"):
|
def get_dataset_list(dir="./training/"):
|
||||||
return sorted([d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.txt" in os.listdir(os.path.join(dir, d)) ])
|
return sorted([d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) and "train.txt" in os.listdir(os.path.join(dir, d)) ])
|
||||||
|
@ -1928,6 +1949,8 @@ def setup_args():
|
||||||
'vocoder-model': VOCODERS[-1],
|
'vocoder-model': VOCODERS[-1],
|
||||||
'tokenizer-json': None,
|
'tokenizer-json': None,
|
||||||
|
|
||||||
|
'phonemizer-backend': 'espeak',
|
||||||
|
|
||||||
'whisper-backend': 'openai/whisper',
|
'whisper-backend': 'openai/whisper',
|
||||||
'whisper-model': "base",
|
'whisper-model': "base",
|
||||||
|
|
||||||
|
@ -1972,6 +1995,8 @@ def setup_args():
|
||||||
parser.add_argument("--vocoder-model", default=default_arguments['vocoder-model'], action='store_true', help="Specifies with vocoder to use")
|
parser.add_argument("--vocoder-model", default=default_arguments['vocoder-model'], action='store_true', help="Specifies with vocoder to use")
|
||||||
parser.add_argument("--tokenizer-json", default=default_arguments['tokenizer-json'], help="Specifies which tokenizer json to use for tokenizing.")
|
parser.add_argument("--tokenizer-json", default=default_arguments['tokenizer-json'], help="Specifies which tokenizer json to use for tokenizing.")
|
||||||
|
|
||||||
|
parser.add_argument("--phonemizer-backend", default=default_arguments['phonemizer-backend'], help="Specifies which phonemizer backend to use.")
|
||||||
|
|
||||||
parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)")
|
parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)")
|
||||||
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
|
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
|
||||||
|
|
||||||
|
@ -2037,6 +2062,8 @@ def get_default_settings( hypenated=True ):
|
||||||
'vocoder-model': args.vocoder_model,
|
'vocoder-model': args.vocoder_model,
|
||||||
'tokenizer-json': args.tokenizer_json,
|
'tokenizer-json': args.tokenizer_json,
|
||||||
|
|
||||||
|
'phonemizer-backend': args.phonemizer_backend,
|
||||||
|
|
||||||
'whisper-backend': args.whisper_backend,
|
'whisper-backend': args.whisper_backend,
|
||||||
'whisper-model': args.whisper_model,
|
'whisper-model': args.whisper_model,
|
||||||
|
|
||||||
|
@ -2081,6 +2108,8 @@ def update_args( **kwargs ):
|
||||||
args.vocoder_model = settings['vocoder_model']
|
args.vocoder_model = settings['vocoder_model']
|
||||||
args.tokenizer_json = settings['tokenizer_json']
|
args.tokenizer_json = settings['tokenizer_json']
|
||||||
|
|
||||||
|
args.phonemizer_backend = settings['phonemizer_backend']
|
||||||
|
|
||||||
args.whisper_backend = settings['whisper_backend']
|
args.whisper_backend = settings['whisper_backend']
|
||||||
args.whisper_model = settings['whisper_model']
|
args.whisper_model = settings['whisper_model']
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user