forked from mrq/ai-voice-cloning
Merge branch 'master' of https://git.ecker.tech/ben_mkiv/ai-voice-cloning
This commit is contained in:
commit
1ec3344999
|
@ -1,8 +1,8 @@
|
||||||
# AI Voice Cloning
|
# AI Voice Cloning
|
||||||
|
|
||||||
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
|
> **Note** This project has been in dire need of being rewritten from the ground up for some time. Apologies for any crust from my rather spaghetti code.
|
||||||
|
|
||||||
Similar to my own findings for Stable Diffusion image generation, this rentry may appear a little disheveled as I note my new findings with TorToiSe. Please keep this in mind if the guide seems to shift a bit or sound confusing.
|
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
|
||||||
|
|
||||||
>\>Ugh... why bother when I can just abuse 11.AI?
|
>\>Ugh... why bother when I can just abuse 11.AI?
|
||||||
|
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
Subproject commit 5ff00bf3bfa97e2c8e9f166b920273f83ac9d8f0
|
Subproject commit b10c58436d6871c26485d30b203e6cfdd4167602
|
|
@ -7,4 +7,5 @@ music-tag
|
||||||
voicefixer
|
voicefixer
|
||||||
psutil
|
psutil
|
||||||
phonemizer
|
phonemizer
|
||||||
pydantic==1.10.11
|
pydantic==1.10.11
|
||||||
|
websockets
|
330
src/utils.py
330
src/utils.py
|
@ -45,7 +45,7 @@ from tortoise.utils.device import get_device_name, set_device_name, get_device_c
|
||||||
|
|
||||||
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
|
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
|
||||||
|
|
||||||
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"]
|
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large-v2"]
|
||||||
WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
|
WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
|
||||||
WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"]
|
WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"]
|
||||||
VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
|
VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
|
||||||
|
@ -61,12 +61,15 @@ RESAMPLERS = {}
|
||||||
|
|
||||||
MIN_TRAINING_DURATION = 0.6
|
MIN_TRAINING_DURATION = 0.6
|
||||||
MAX_TRAINING_DURATION = 11.6097505669
|
MAX_TRAINING_DURATION = 11.6097505669
|
||||||
|
MAX_TRAINING_CHAR_LENGTH = 200
|
||||||
|
|
||||||
VALLE_ENABLED = False
|
VALLE_ENABLED = False
|
||||||
BARK_ENABLED = False
|
BARK_ENABLED = False
|
||||||
|
|
||||||
VERBOSE_DEBUG = True
|
VERBOSE_DEBUG = True
|
||||||
|
|
||||||
|
import traceback
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from whisper.normalizers.english import EnglishTextNormalizer
|
from whisper.normalizers.english import EnglishTextNormalizer
|
||||||
from whisper.normalizers.basic import BasicTextNormalizer
|
from whisper.normalizers.basic import BasicTextNormalizer
|
||||||
|
@ -75,7 +78,7 @@ try:
|
||||||
print("Whisper detected")
|
print("Whisper detected")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if VERBOSE_DEBUG:
|
if VERBOSE_DEBUG:
|
||||||
print("Error:", e)
|
print(traceback.format_exc())
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -90,12 +93,14 @@ try:
|
||||||
VALLE_ENABLED = True
|
VALLE_ENABLED = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if VERBOSE_DEBUG:
|
if VERBOSE_DEBUG:
|
||||||
print("Error:", e)
|
print(traceback.format_exc())
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if VALLE_ENABLED:
|
if VALLE_ENABLED:
|
||||||
TTSES.append('vall-e')
|
TTSES.append('vall-e')
|
||||||
|
|
||||||
|
# torchaudio.set_audio_backend('soundfile')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import bark
|
import bark
|
||||||
from bark import text_to_semantic
|
from bark import text_to_semantic
|
||||||
|
@ -109,35 +114,10 @@ try:
|
||||||
BARK_ENABLED = True
|
BARK_ENABLED = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if VERBOSE_DEBUG:
|
if VERBOSE_DEBUG:
|
||||||
print("Error:", e)
|
print(traceback.format_exc())
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if BARK_ENABLED:
|
if BARK_ENABLED:
|
||||||
try:
|
|
||||||
from vocos import Vocos
|
|
||||||
VOCOS_ENABLED = True
|
|
||||||
print("Vocos detected")
|
|
||||||
except Exception as e:
|
|
||||||
if VERBOSE_DEBUG:
|
|
||||||
print("Error:", e)
|
|
||||||
VOCOS_ENABLED = False
|
|
||||||
|
|
||||||
try:
|
|
||||||
from hubert.hubert_manager import HuBERTManager
|
|
||||||
from hubert.pre_kmeans_hubert import CustomHubert
|
|
||||||
from hubert.customtokenizer import CustomTokenizer
|
|
||||||
|
|
||||||
hubert_manager = HuBERTManager()
|
|
||||||
hubert_manager.make_sure_hubert_installed()
|
|
||||||
hubert_manager.make_sure_tokenizer_installed()
|
|
||||||
|
|
||||||
HUBERT_ENABLED = True
|
|
||||||
print("HuBERT detected")
|
|
||||||
except Exception as e:
|
|
||||||
if VERBOSE_DEBUG:
|
|
||||||
print("Error:", e)
|
|
||||||
HUBERT_ENABLED = False
|
|
||||||
|
|
||||||
TTSES.append('bark')
|
TTSES.append('bark')
|
||||||
|
|
||||||
def semantic_to_audio_tokens(
|
def semantic_to_audio_tokens(
|
||||||
|
@ -181,7 +161,32 @@ if BARK_ENABLED:
|
||||||
|
|
||||||
self.device = get_device_name()
|
self.device = get_device_name()
|
||||||
|
|
||||||
if VOCOS_ENABLED:
|
try:
|
||||||
|
from vocos import Vocos
|
||||||
|
self.vocos_enabled = True
|
||||||
|
print("Vocos detected")
|
||||||
|
except Exception as e:
|
||||||
|
if VERBOSE_DEBUG:
|
||||||
|
print(traceback.format_exc())
|
||||||
|
self.vocos_enabled = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
from hubert.hubert_manager import HuBERTManager
|
||||||
|
from hubert.pre_kmeans_hubert import CustomHubert
|
||||||
|
from hubert.customtokenizer import CustomTokenizer
|
||||||
|
|
||||||
|
hubert_manager = HuBERTManager()
|
||||||
|
hubert_manager.make_sure_hubert_installed()
|
||||||
|
hubert_manager.make_sure_tokenizer_installed()
|
||||||
|
|
||||||
|
self.hubert_enabled = True
|
||||||
|
print("HuBERT detected")
|
||||||
|
except Exception as e:
|
||||||
|
if VERBOSE_DEBUG:
|
||||||
|
print(traceback.format_exc())
|
||||||
|
self.hubert_enabled = False
|
||||||
|
|
||||||
|
if self.vocos_enabled:
|
||||||
self.vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz").to(self.device)
|
self.vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz").to(self.device)
|
||||||
|
|
||||||
def create_voice( self, voice ):
|
def create_voice( self, voice ):
|
||||||
|
@ -238,7 +243,7 @@ if BARK_ENABLED:
|
||||||
|
|
||||||
# generate semantic tokens
|
# generate semantic tokens
|
||||||
|
|
||||||
if HUBERT_ENABLED:
|
if self.hubert_enabled:
|
||||||
wav = wav.to(self.device)
|
wav = wav.to(self.device)
|
||||||
|
|
||||||
# Extract discrete codes from EnCodec
|
# Extract discrete codes from EnCodec
|
||||||
|
@ -426,7 +431,7 @@ def generate_bark(**kwargs):
|
||||||
idx_cache = {}
|
idx_cache = {}
|
||||||
for i, file in enumerate(os.listdir(outdir)):
|
for i, file in enumerate(os.listdir(outdir)):
|
||||||
filename = os.path.basename(file)
|
filename = os.path.basename(file)
|
||||||
extension = os.path.splitext(filename)[1]
|
extension = os.path.splitext(filename)[-1][1:]
|
||||||
if extension != ".json" and extension != ".wav":
|
if extension != ".json" and extension != ".wav":
|
||||||
continue
|
continue
|
||||||
match = re.findall(rf"^{cleanup_voice_name(voice)}_(\d+)(?:.+?)?{extension}$", filename)
|
match = re.findall(rf"^{cleanup_voice_name(voice)}_(\d+)(?:.+?)?{extension}$", filename)
|
||||||
|
@ -672,18 +677,23 @@ def generate_valle(**kwargs):
|
||||||
|
|
||||||
voice_cache = {}
|
voice_cache = {}
|
||||||
def fetch_voice( voice ):
|
def fetch_voice( voice ):
|
||||||
|
if voice in voice_cache:
|
||||||
|
return voice_cache[voice]
|
||||||
voice_dir = f'./training/{voice}/audio/'
|
voice_dir = f'./training/{voice}/audio/'
|
||||||
if not os.path.isdir(voice_dir):
|
|
||||||
|
if not os.path.isdir(voice_dir) or len(os.listdir(voice_dir)) == 0:
|
||||||
voice_dir = f'./voices/{voice}/'
|
voice_dir = f'./voices/{voice}/'
|
||||||
|
|
||||||
files = [ f'{voice_dir}/{d}' for d in os.listdir(voice_dir) if d[-4:] == ".wav" ]
|
files = [ f'{voice_dir}/{d}' for d in os.listdir(voice_dir) if d[-4:] == ".wav" ]
|
||||||
# return files
|
# return files
|
||||||
return random.choice(files)
|
voice_cache[voice] = random.choice(files)
|
||||||
|
return voice_cache[voice]
|
||||||
|
|
||||||
def get_settings( override=None ):
|
def get_settings( override=None ):
|
||||||
settings = {
|
settings = {
|
||||||
'ar_temp': float(parameters['temperature']),
|
'ar_temp': float(parameters['temperature']),
|
||||||
'nar_temp': float(parameters['temperature']),
|
'nar_temp': float(parameters['temperature']),
|
||||||
'max_ar_samples': parameters['num_autoregressive_samples'],
|
'max_ar_steps': parameters['num_autoregressive_samples'],
|
||||||
}
|
}
|
||||||
|
|
||||||
# could be better to just do a ternary on everything above, but i am not a professional
|
# could be better to just do a ternary on everything above, but i am not a professional
|
||||||
|
@ -697,7 +707,7 @@ def generate_valle(**kwargs):
|
||||||
continue
|
continue
|
||||||
settings[k] = override[k]
|
settings[k] = override[k]
|
||||||
|
|
||||||
settings['reference'] = fetch_voice(voice=selected_voice)
|
settings['references'] = [ fetch_voice(voice=selected_voice) for _ in range(3) ]
|
||||||
return settings
|
return settings
|
||||||
|
|
||||||
if not parameters['delimiter']:
|
if not parameters['delimiter']:
|
||||||
|
@ -723,7 +733,7 @@ def generate_valle(**kwargs):
|
||||||
idx_cache = {}
|
idx_cache = {}
|
||||||
for i, file in enumerate(os.listdir(outdir)):
|
for i, file in enumerate(os.listdir(outdir)):
|
||||||
filename = os.path.basename(file)
|
filename = os.path.basename(file)
|
||||||
extension = os.path.splitext(filename)[1]
|
extension = os.path.splitext(filename)[-1][1:]
|
||||||
if extension != ".json" and extension != ".wav":
|
if extension != ".json" and extension != ".wav":
|
||||||
continue
|
continue
|
||||||
match = re.findall(rf"^{voice}_(\d+)(?:.+?)?{extension}$", filename)
|
match = re.findall(rf"^{voice}_(\d+)(?:.+?)?{extension}$", filename)
|
||||||
|
@ -783,11 +793,14 @@ def generate_valle(**kwargs):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise Exception("Prompt settings editing requested, but received invalid JSON")
|
raise Exception("Prompt settings editing requested, but received invalid JSON")
|
||||||
|
|
||||||
settings = get_settings( override=override )
|
name = get_name(line=line, candidate=0)
|
||||||
reference = settings['reference']
|
|
||||||
settings.pop("reference")
|
|
||||||
|
|
||||||
gen = tts.inference(cut_text, reference, **settings )
|
settings = get_settings( override=override )
|
||||||
|
references = settings['references']
|
||||||
|
settings.pop("references")
|
||||||
|
settings['out_path'] = f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav'
|
||||||
|
|
||||||
|
gen = tts.inference(cut_text, references, **settings )
|
||||||
|
|
||||||
run_time = time.time()-start_time
|
run_time = time.time()-start_time
|
||||||
print(f"Generating line took {run_time} seconds")
|
print(f"Generating line took {run_time} seconds")
|
||||||
|
@ -805,7 +818,7 @@ def generate_valle(**kwargs):
|
||||||
|
|
||||||
# save here in case some error happens mid-batch
|
# save here in case some error happens mid-batch
|
||||||
#torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', wav.cpu(), sr)
|
#torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', wav.cpu(), sr)
|
||||||
soundfile.write(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', wav.cpu()[0,0], sr)
|
#soundfile.write(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', wav.cpu()[0,0], sr)
|
||||||
wav, sr = torchaudio.load(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav')
|
wav, sr = torchaudio.load(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav')
|
||||||
|
|
||||||
audio_cache[name] = {
|
audio_cache[name] = {
|
||||||
|
@ -1085,7 +1098,7 @@ def generate_tortoise(**kwargs):
|
||||||
idx_cache = {}
|
idx_cache = {}
|
||||||
for i, file in enumerate(os.listdir(outdir)):
|
for i, file in enumerate(os.listdir(outdir)):
|
||||||
filename = os.path.basename(file)
|
filename = os.path.basename(file)
|
||||||
extension = os.path.splitext(filename)[1]
|
extension = os.path.splitext(filename)[-1][1:]
|
||||||
if extension != ".json" and extension != ".wav":
|
if extension != ".json" and extension != ".wav":
|
||||||
continue
|
continue
|
||||||
match = re.findall(rf"^{voice}_(\d+)(?:.+?)?{extension}$", filename)
|
match = re.findall(rf"^{voice}_(\d+)(?:.+?)?{extension}$", filename)
|
||||||
|
@ -1605,30 +1618,18 @@ class TrainingState():
|
||||||
if args.tts_backend == "vall-e":
|
if args.tts_backend == "vall-e":
|
||||||
keys['lrs'] = [
|
keys['lrs'] = [
|
||||||
'ar.lr', 'nar.lr',
|
'ar.lr', 'nar.lr',
|
||||||
'ar-half.lr', 'nar-half.lr',
|
|
||||||
'ar-quarter.lr', 'nar-quarter.lr',
|
|
||||||
]
|
]
|
||||||
keys['losses'] = [
|
keys['losses'] = [
|
||||||
'ar.loss', 'nar.loss', 'ar+nar.loss',
|
# 'ar.loss', 'nar.loss', 'ar+nar.loss',
|
||||||
'ar-half.loss', 'nar-half.loss', 'ar-half+nar-half.loss',
|
|
||||||
'ar-quarter.loss', 'nar-quarter.loss', 'ar-quarter+nar-quarter.loss',
|
|
||||||
|
|
||||||
'ar.loss.nll', 'nar.loss.nll',
|
'ar.loss.nll', 'nar.loss.nll',
|
||||||
'ar-half.loss.nll', 'nar-half.loss.nll',
|
|
||||||
'ar-quarter.loss.nll', 'nar-quarter.loss.nll',
|
|
||||||
]
|
]
|
||||||
|
|
||||||
keys['accuracies'] = [
|
keys['accuracies'] = [
|
||||||
'ar.loss.acc', 'nar.loss.acc',
|
'ar.loss.acc', 'nar.loss.acc',
|
||||||
'ar-half.loss.acc', 'nar-half.loss.acc',
|
'ar.stats.acc', 'nar.loss.acc',
|
||||||
'ar-quarter.loss.acc', 'nar-quarter.loss.acc',
|
|
||||||
]
|
]
|
||||||
keys['precisions'] = [
|
keys['precisions'] = [ 'ar.loss.precision', 'nar.loss.precision', ]
|
||||||
'ar.loss.precision', 'nar.loss.precision',
|
keys['grad_norms'] = ['ar.grad_norm', 'nar.grad_norm']
|
||||||
'ar-half.loss.precision', 'nar-half.loss.precision',
|
|
||||||
'ar-quarter.loss.precision', 'nar-quarter.loss.precision',
|
|
||||||
]
|
|
||||||
keys['grad_norms'] = ['ar.grad_norm', 'nar.grad_norm', 'ar-half.grad_norm', 'nar-half.grad_norm', 'ar-quarter.grad_norm', 'nar-quarter.grad_norm']
|
|
||||||
|
|
||||||
for k in keys['lrs']:
|
for k in keys['lrs']:
|
||||||
if k not in self.info:
|
if k not in self.info:
|
||||||
|
@ -1746,7 +1747,8 @@ class TrainingState():
|
||||||
if args.tts_backend == "tortoise":
|
if args.tts_backend == "tortoise":
|
||||||
logs = sorted([f'{self.training_dir}/finetune/{d}' for d in os.listdir(f'{self.training_dir}/finetune/') if d[-4:] == ".log" ])
|
logs = sorted([f'{self.training_dir}/finetune/{d}' for d in os.listdir(f'{self.training_dir}/finetune/') if d[-4:] == ".log" ])
|
||||||
else:
|
else:
|
||||||
logs = sorted([f'{self.training_dir}/logs/{d}/log.txt' for d in os.listdir(f'{self.training_dir}/logs/') ])
|
log_dir = "logs"
|
||||||
|
logs = sorted([f'{self.training_dir}/{log_dir}/{d}/log.txt' for d in os.listdir(f'{self.training_dir}/{log_dir}/') ])
|
||||||
|
|
||||||
if update:
|
if update:
|
||||||
logs = [logs[-1]]
|
logs = [logs[-1]]
|
||||||
|
@ -2219,6 +2221,8 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non
|
||||||
files = get_voice(voice, load_latents=False)
|
files = get_voice(voice, load_latents=False)
|
||||||
indir = f'./training/{voice}/'
|
indir = f'./training/{voice}/'
|
||||||
infile = f'{indir}/whisper.json'
|
infile = f'{indir}/whisper.json'
|
||||||
|
|
||||||
|
quantize_in_memory = args.tts_backend == "vall-e"
|
||||||
|
|
||||||
os.makedirs(f'{indir}/audio/', exist_ok=True)
|
os.makedirs(f'{indir}/audio/', exist_ok=True)
|
||||||
|
|
||||||
|
@ -2245,13 +2249,24 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non
|
||||||
continue
|
continue
|
||||||
|
|
||||||
results[basename] = result
|
results[basename] = result
|
||||||
waveform, sample_rate = torchaudio.load(file)
|
|
||||||
# resample to the input rate, since it'll get resampled for training anyways
|
if not quantize_in_memory:
|
||||||
# this should also "help" increase throughput a bit when filling the dataloaders
|
waveform, sample_rate = torchaudio.load(file)
|
||||||
waveform, sample_rate = resample(waveform, sample_rate, TARGET_SAMPLE_RATE)
|
# resample to the input rate, since it'll get resampled for training anyways
|
||||||
if waveform.shape[0] == 2:
|
# this should also "help" increase throughput a bit when filling the dataloaders
|
||||||
waveform = waveform[:1]
|
waveform, sample_rate = resample(waveform, sample_rate, TARGET_SAMPLE_RATE)
|
||||||
torchaudio.save(f"{indir}/audio/{basename}", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
|
if waveform.shape[0] == 2:
|
||||||
|
waveform = waveform[:1]
|
||||||
|
|
||||||
|
try:
|
||||||
|
kwargs = {}
|
||||||
|
if basename[-4:] == ".wav":
|
||||||
|
kwargs['encoding'] = "PCM_S"
|
||||||
|
kwargs['bits_per_sample'] = 16
|
||||||
|
|
||||||
|
torchaudio.save(f"{indir}/audio/{basename}", waveform, sample_rate, **kwargs)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
with open(infile, 'w', encoding="utf-8") as f:
|
with open(infile, 'w', encoding="utf-8") as f:
|
||||||
f.write(json.dumps(results, indent='\t'))
|
f.write(json.dumps(results, indent='\t'))
|
||||||
|
@ -2317,6 +2332,9 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
|
||||||
segments = 0
|
segments = 0
|
||||||
for filename in results:
|
for filename in results:
|
||||||
path = f'./voices/{voice}/{filename}'
|
path = f'./voices/{voice}/{filename}'
|
||||||
|
extension = os.path.splitext(filename)[-1][1:]
|
||||||
|
out_extension = extension # "wav"
|
||||||
|
|
||||||
if not os.path.exists(path):
|
if not os.path.exists(path):
|
||||||
path = f'./training/{voice}/{filename}'
|
path = f'./training/{voice}/{filename}'
|
||||||
|
|
||||||
|
@ -2333,7 +2351,7 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
|
||||||
duration = num_frames / sample_rate
|
duration = num_frames / sample_rate
|
||||||
|
|
||||||
for segment in result['segments']:
|
for segment in result['segments']:
|
||||||
file = filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav")
|
file = filename.replace(f".{extension}", f"_{pad(segment['id'], 4)}.{out_extension}")
|
||||||
|
|
||||||
sliced, error = slice_waveform( waveform, sample_rate, segment['start'] + start_offset, segment['end'] + end_offset, trim_silence )
|
sliced, error = slice_waveform( waveform, sample_rate, segment['start'] + start_offset, segment['end'] + end_offset, trim_silence )
|
||||||
if error:
|
if error:
|
||||||
|
@ -2341,12 +2359,18 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
|
||||||
print(message)
|
print(message)
|
||||||
messages.append(message)
|
messages.append(message)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
sliced, _ = resample( sliced, sample_rate, TARGET_SAMPLE_RATE )
|
sliced, _ = resample( sliced, sample_rate, TARGET_SAMPLE_RATE )
|
||||||
|
|
||||||
if waveform.shape[0] == 2:
|
if waveform.shape[0] == 2:
|
||||||
waveform = waveform[:1]
|
waveform = waveform[:1]
|
||||||
|
|
||||||
torchaudio.save(f"{indir}/audio/{file}", sliced, TARGET_SAMPLE_RATE, encoding="PCM_S", bits_per_sample=16)
|
kwargs = {}
|
||||||
|
if file[-4:] == ".wav":
|
||||||
|
kwargs['encoding'] = "PCM_S"
|
||||||
|
kwargs['bits_per_sample'] = 16
|
||||||
|
|
||||||
|
torchaudio.save(f"{indir}/audio/{file}", sliced, TARGET_SAMPLE_RATE, **kwargs)
|
||||||
|
|
||||||
segments +=1
|
segments +=1
|
||||||
|
|
||||||
|
@ -2462,18 +2486,32 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
|
|
||||||
errored = 0
|
errored = 0
|
||||||
messages = []
|
messages = []
|
||||||
normalize = True
|
normalize = False # True
|
||||||
phonemize = should_phonemize()
|
phonemize = should_phonemize()
|
||||||
lines = { 'training': [], 'validation': [] }
|
lines = { 'training': [], 'validation': [] }
|
||||||
segments = {}
|
segments = {}
|
||||||
|
|
||||||
|
quantize_in_memory = args.tts_backend == "vall-e"
|
||||||
|
|
||||||
if args.tts_backend != "tortoise":
|
if args.tts_backend != "tortoise":
|
||||||
text_length = 0
|
text_length = 0
|
||||||
audio_length = 0
|
audio_length = 0
|
||||||
|
|
||||||
|
start_offset = -0.1
|
||||||
|
end_offset = 0.1
|
||||||
|
trim_silence = False
|
||||||
|
|
||||||
|
TARGET_SAMPLE_RATE = 22050
|
||||||
|
if args.tts_backend != "tortoise":
|
||||||
|
TARGET_SAMPLE_RATE = 24000
|
||||||
|
if tts:
|
||||||
|
TARGET_SAMPLE_RATE = tts.input_sample_rate
|
||||||
|
|
||||||
for filename in tqdm(results, desc="Parsing results"):
|
for filename in tqdm(results, desc="Parsing results"):
|
||||||
use_segment = use_segments
|
use_segment = use_segments
|
||||||
|
|
||||||
|
extension = os.path.splitext(filename)[-1][1:]
|
||||||
|
out_extension = extension # "wav"
|
||||||
result = results[filename]
|
result = results[filename]
|
||||||
lang = result['language']
|
lang = result['language']
|
||||||
language = LANGUAGES[lang] if lang in LANGUAGES else lang
|
language = LANGUAGES[lang] if lang in LANGUAGES else lang
|
||||||
|
@ -2481,8 +2519,8 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
|
|
||||||
# check if unsegmented text exceeds 200 characters
|
# check if unsegmented text exceeds 200 characters
|
||||||
if not use_segment:
|
if not use_segment:
|
||||||
if len(result['text']) > 200:
|
if len(result['text']) > MAX_TRAINING_CHAR_LENGTH:
|
||||||
message = f"Text length too long (200 < {len(result['text'])}), using segments: {filename}"
|
message = f"Text length too long ({MAX_TRAINING_CHAR_LENGTH} < {len(result['text'])}), using segments: {filename}"
|
||||||
print(message)
|
print(message)
|
||||||
messages.append(message)
|
messages.append(message)
|
||||||
use_segment = True
|
use_segment = True
|
||||||
|
@ -2490,13 +2528,15 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
# check if unsegmented audio exceeds 11.6s
|
# check if unsegmented audio exceeds 11.6s
|
||||||
if not use_segment:
|
if not use_segment:
|
||||||
path = f'{indir}/audio/{filename}'
|
path = f'{indir}/audio/{filename}'
|
||||||
if not os.path.exists(path):
|
if not quantize_in_memory and not os.path.exists(path):
|
||||||
messages.append(f"Missing source audio: {filename}")
|
messages.append(f"Missing source audio: {filename}")
|
||||||
errored += 1
|
errored += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
metadata = torchaudio.info(path)
|
duration = 0
|
||||||
duration = metadata.num_frames / metadata.sample_rate
|
for segment in result['segments']:
|
||||||
|
duration = max(duration, result['segments'][segment]['end'])
|
||||||
|
|
||||||
if duration >= MAX_TRAINING_DURATION:
|
if duration >= MAX_TRAINING_DURATION:
|
||||||
message = f"Audio too large, using segments: {filename}"
|
message = f"Audio too large, using segments: {filename}"
|
||||||
print(message)
|
print(message)
|
||||||
|
@ -2511,19 +2551,36 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
if duration <= MIN_TRAINING_DURATION or MAX_TRAINING_DURATION <= duration:
|
if duration <= MIN_TRAINING_DURATION or MAX_TRAINING_DURATION <= duration:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
path = f'{indir}/audio/' + filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav")
|
path = f'{indir}/audio/' + filename.replace(f".{extension}", f"_{pad(segment['id'], 4)}.{out_extension}")
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
continue
|
continue
|
||||||
exists = False
|
exists = False
|
||||||
break
|
break
|
||||||
|
|
||||||
if not exists:
|
if not quantize_in_memory and not exists:
|
||||||
tmp = {}
|
tmp = {}
|
||||||
tmp[filename] = result
|
tmp[filename] = result
|
||||||
print(f"Audio not segmented, segmenting: {filename}")
|
print(f"Audio not segmented, segmenting: {filename}")
|
||||||
message = slice_dataset( voice, results=tmp )
|
message = slice_dataset( voice, results=tmp )
|
||||||
print(message)
|
print(message)
|
||||||
messages = messages + message.split("\n")
|
messages = messages + message.split("\n")
|
||||||
|
|
||||||
|
waveform = None
|
||||||
|
|
||||||
|
|
||||||
|
if quantize_in_memory:
|
||||||
|
path = f'{indir}/audio/{filename}'
|
||||||
|
if not os.path.exists(path):
|
||||||
|
path = f'./voices/{voice}/{filename}'
|
||||||
|
|
||||||
|
if not os.path.exists(path):
|
||||||
|
message = f"Audio not found: {path}"
|
||||||
|
print(message)
|
||||||
|
messages.append(message)
|
||||||
|
#continue
|
||||||
|
else:
|
||||||
|
waveform = torchaudio.load(path)
|
||||||
|
waveform = resample(waveform[0], waveform[1], TARGET_SAMPLE_RATE)
|
||||||
|
|
||||||
if not use_segment:
|
if not use_segment:
|
||||||
segments[filename] = {
|
segments[filename] = {
|
||||||
|
@ -2533,13 +2590,18 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
'normalizer': normalizer,
|
'normalizer': normalizer,
|
||||||
'phonemes': result['phonemes'] if 'phonemes' in result else None
|
'phonemes': result['phonemes'] if 'phonemes' in result else None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if waveform:
|
||||||
|
segments[filename]['waveform'] = waveform
|
||||||
else:
|
else:
|
||||||
for segment in result['segments']:
|
for segment in result['segments']:
|
||||||
duration = segment['end'] - segment['start']
|
duration = segment['end'] - segment['start']
|
||||||
if duration <= MIN_TRAINING_DURATION or MAX_TRAINING_DURATION <= duration:
|
if duration <= MIN_TRAINING_DURATION or MAX_TRAINING_DURATION <= duration:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
segments[filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav")] = {
|
file = filename.replace(f".{extension}", f"_{pad(segment['id'], 4)}.{out_extension}")
|
||||||
|
|
||||||
|
segments[file] = {
|
||||||
'text': segment['text'],
|
'text': segment['text'],
|
||||||
'lang': lang,
|
'lang': lang,
|
||||||
'language': language,
|
'language': language,
|
||||||
|
@ -2547,22 +2609,27 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
'phonemes': segment['phonemes'] if 'phonemes' in segment else None
|
'phonemes': segment['phonemes'] if 'phonemes' in segment else None
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if waveform:
|
||||||
|
sliced, error = slice_waveform( waveform[0], waveform[1], segment['start'] + start_offset, segment['end'] + end_offset, trim_silence )
|
||||||
|
if error:
|
||||||
|
message = f"{error}, skipping... {file}"
|
||||||
|
print(message)
|
||||||
|
messages.append(message)
|
||||||
|
segments[file]['error'] = error
|
||||||
|
#continue
|
||||||
|
else:
|
||||||
|
segments[file]['waveform'] = (sliced, waveform[1])
|
||||||
|
|
||||||
jobs = {
|
jobs = {
|
||||||
'quantize': [[], []],
|
'quantize': [[], []],
|
||||||
'phonemize': [[], []],
|
'phonemize': [[], []],
|
||||||
}
|
}
|
||||||
|
|
||||||
for file in tqdm(segments, desc="Parsing segments"):
|
for file in tqdm(segments, desc="Parsing segments"):
|
||||||
|
extension = os.path.splitext(file)[-1][1:]
|
||||||
result = segments[file]
|
result = segments[file]
|
||||||
path = f'{indir}/audio/{file}'
|
path = f'{indir}/audio/{file}'
|
||||||
|
|
||||||
if not os.path.exists(path):
|
|
||||||
message = f"Missing segment, skipping... {file}"
|
|
||||||
print(message)
|
|
||||||
messages.append(message)
|
|
||||||
errored += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
text = result['text']
|
text = result['text']
|
||||||
lang = result['lang']
|
lang = result['lang']
|
||||||
language = result['language']
|
language = result['language']
|
||||||
|
@ -2573,28 +2640,20 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
|
|
||||||
normalized = normalizer(text) if normalize else text
|
normalized = normalizer(text) if normalize else text
|
||||||
|
|
||||||
if len(text) > 200:
|
if len(text) > MAX_TRAINING_CHAR_LENGTH:
|
||||||
message = f"Text length too long (200 < {len(text)}), skipping... {file}"
|
message = f"Text length too long ({MAX_TRAINING_CHAR_LENGTH} < {len(text)}), skipping... {file}"
|
||||||
print(message)
|
print(message)
|
||||||
messages.append(message)
|
messages.append(message)
|
||||||
errored += 1
|
errored += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
waveform, sample_rate = torchaudio.load(path)
|
# num_channels, num_frames = waveform.shape
|
||||||
num_channels, num_frames = waveform.shape
|
#duration = num_frames / sample_rate
|
||||||
duration = num_frames / sample_rate
|
|
||||||
|
|
||||||
error = validate_waveform( waveform, sample_rate )
|
|
||||||
if error:
|
|
||||||
message = f"{error}, skipping... {file}"
|
|
||||||
print(message)
|
|
||||||
messages.append(message)
|
|
||||||
errored += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
culled = len(text) < text_length
|
culled = len(text) < text_length
|
||||||
if not culled and audio_length > 0:
|
#if not culled and audio_length > 0:
|
||||||
culled = duration < audio_length
|
# culled = duration < audio_length
|
||||||
|
|
||||||
line = f'audio/{file}|{phonemes if phonemize and phonemes else text}'
|
line = f'audio/{file}|{phonemes if phonemize and phonemes else text}'
|
||||||
|
|
||||||
|
@ -2605,17 +2664,8 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
|
|
||||||
os.makedirs(f'{indir}/valle/', exist_ok=True)
|
os.makedirs(f'{indir}/valle/', exist_ok=True)
|
||||||
|
|
||||||
qnt_file = f'{indir}/valle/{file.replace(".wav",".qnt.pt")}'
|
#phn_file = f'{indir}/valle/{file.replace(f".{extension}",".phn.txt")}'
|
||||||
if not os.path.exists(qnt_file):
|
phn_file = f'./training/valle/data/{voice}/{file.replace(f".{extension}",".phn.txt")}'
|
||||||
jobs['quantize'][0].append(qnt_file)
|
|
||||||
jobs['quantize'][1].append((waveform, sample_rate))
|
|
||||||
"""
|
|
||||||
quantized = valle_quantize( waveform, sample_rate ).cpu()
|
|
||||||
torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')
|
|
||||||
print("Quantized:", file)
|
|
||||||
"""
|
|
||||||
|
|
||||||
phn_file = f'{indir}/valle/{file.replace(".wav",".phn.txt")}'
|
|
||||||
if not os.path.exists(phn_file):
|
if not os.path.exists(phn_file):
|
||||||
jobs['phonemize'][0].append(phn_file)
|
jobs['phonemize'][0].append(phn_file)
|
||||||
jobs['phonemize'][1].append(normalized)
|
jobs['phonemize'][1].append(normalized)
|
||||||
|
@ -2625,13 +2675,46 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
print("Phonemized:", file, normalized, text)
|
print("Phonemized:", file, normalized, text)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
#qnt_file = f'{indir}/valle/{file.replace(f".{extension}",".qnt.pt")}'
|
||||||
|
qnt_file = f'./training/valle/data/{voice}/{file.replace(f".{extension}",".qnt.pt")}'
|
||||||
|
if 'error' not in result:
|
||||||
|
if not quantize_in_memory and not os.path.exists(path):
|
||||||
|
message = f"Missing segment, skipping... {file}"
|
||||||
|
print(message)
|
||||||
|
messages.append(message)
|
||||||
|
errored += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not os.path.exists(qnt_file):
|
||||||
|
waveform = None
|
||||||
|
if 'waveform' in result:
|
||||||
|
waveform, sample_rate = result['waveform']
|
||||||
|
elif os.path.exists(path):
|
||||||
|
waveform, sample_rate = torchaudio.load(path)
|
||||||
|
error = validate_waveform( waveform, sample_rate )
|
||||||
|
if error:
|
||||||
|
message = f"{error}, skipping... {file}"
|
||||||
|
print(message)
|
||||||
|
messages.append(message)
|
||||||
|
errored += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
if waveform is not None:
|
||||||
|
jobs['quantize'][0].append(qnt_file)
|
||||||
|
jobs['quantize'][1].append((waveform, sample_rate))
|
||||||
|
"""
|
||||||
|
quantized = valle_quantize( waveform, sample_rate ).cpu()
|
||||||
|
torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')
|
||||||
|
print("Quantized:", file)
|
||||||
|
"""
|
||||||
|
|
||||||
for i in tqdm(range(len(jobs['quantize'][0])), desc="Quantizing"):
|
for i in tqdm(range(len(jobs['quantize'][0])), desc="Quantizing"):
|
||||||
qnt_file = jobs['quantize'][0][i]
|
qnt_file = jobs['quantize'][0][i]
|
||||||
waveform, sample_rate = jobs['quantize'][1][i]
|
waveform, sample_rate = jobs['quantize'][1][i]
|
||||||
|
|
||||||
quantized = valle_quantize( waveform, sample_rate ).cpu()
|
quantized = valle_quantize( waveform, sample_rate ).cpu()
|
||||||
torch.save(quantized, qnt_file)
|
torch.save(quantized, qnt_file)
|
||||||
print("Quantized:", qnt_file)
|
#print("Quantized:", qnt_file)
|
||||||
|
|
||||||
for i in tqdm(range(len(jobs['phonemize'][0])), desc="Phonemizing"):
|
for i in tqdm(range(len(jobs['phonemize'][0])), desc="Phonemizing"):
|
||||||
phn_file = jobs['phonemize'][0][i]
|
phn_file = jobs['phonemize'][0][i]
|
||||||
|
@ -2640,7 +2723,7 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
try:
|
try:
|
||||||
phonemized = valle_phonemize( normalized )
|
phonemized = valle_phonemize( normalized )
|
||||||
open(phn_file, 'w', encoding='utf-8').write(" ".join(phonemized))
|
open(phn_file, 'w', encoding='utf-8').write(" ".join(phonemized))
|
||||||
print("Phonemized:", phn_file)
|
#print("Phonemized:", phn_file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
message = f"Failed to phonemize: {phn_file}: {normalized}"
|
message = f"Failed to phonemize: {phn_file}: {normalized}"
|
||||||
messages.append(message)
|
messages.append(message)
|
||||||
|
@ -2970,17 +3053,26 @@ def import_voices(files, saveAs=None, progress=None):
|
||||||
def relative_paths( dirs ):
|
def relative_paths( dirs ):
|
||||||
return [ './' + os.path.relpath( d ).replace("\\", "/") for d in dirs ]
|
return [ './' + os.path.relpath( d ).replace("\\", "/") for d in dirs ]
|
||||||
|
|
||||||
def get_voice( name, dir=get_voice_dir(), load_latents=True ):
|
def get_voice( name, dir=get_voice_dir(), load_latents=True, extensions=["wav", "mp3", "flac"] ):
|
||||||
subj = f'{dir}/{name}/'
|
subj = f'{dir}/{name}/'
|
||||||
if not os.path.isdir(subj):
|
if not os.path.isdir(subj):
|
||||||
return
|
return
|
||||||
|
files = os.listdir(subj)
|
||||||
voice = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.flac'))
|
|
||||||
if load_latents:
|
if load_latents:
|
||||||
voice = voice + list(glob(f'{subj}/*.pth'))
|
extensions.append("pth")
|
||||||
|
|
||||||
|
voice = []
|
||||||
|
for file in files:
|
||||||
|
ext = os.path.splitext(file)[-1][1:]
|
||||||
|
if ext not in extensions:
|
||||||
|
continue
|
||||||
|
|
||||||
|
voice.append(f'{subj}/{file}')
|
||||||
|
|
||||||
return sorted( voice )
|
return sorted( voice )
|
||||||
|
|
||||||
def get_voice_list(dir=get_voice_dir(), append_defaults=False):
|
def get_voice_list(dir=get_voice_dir(), append_defaults=False, extensions=["wav", "mp3", "flac", "pth"]):
|
||||||
defaults = [ "random", "microphone" ]
|
defaults = [ "random", "microphone" ]
|
||||||
os.makedirs(dir, exist_ok=True)
|
os.makedirs(dir, exist_ok=True)
|
||||||
#res = sorted([d for d in os.listdir(dir) if d not in defaults and os.path.isdir(os.path.join(dir, d)) and len(os.listdir(os.path.join(dir, d))) > 0 ])
|
#res = sorted([d for d in os.listdir(dir) if d not in defaults and os.path.isdir(os.path.join(dir, d)) and len(os.listdir(os.path.join(dir, d))) > 0 ])
|
||||||
|
@ -2993,7 +3085,7 @@ def get_voice_list(dir=get_voice_dir(), append_defaults=False):
|
||||||
continue
|
continue
|
||||||
if len(os.listdir(os.path.join(dir, name))) == 0:
|
if len(os.listdir(os.path.join(dir, name))) == 0:
|
||||||
continue
|
continue
|
||||||
files = get_voice( name, dir=dir )
|
files = get_voice( name, dir=dir, extensions=extensions )
|
||||||
|
|
||||||
if len(files) > 0:
|
if len(files) > 0:
|
||||||
res.append(name)
|
res.append(name)
|
||||||
|
@ -3001,7 +3093,7 @@ def get_voice_list(dir=get_voice_dir(), append_defaults=False):
|
||||||
for subdir in os.listdir(f'{dir}/{name}'):
|
for subdir in os.listdir(f'{dir}/{name}'):
|
||||||
if not os.path.isdir(f'{dir}/{name}/{subdir}'):
|
if not os.path.isdir(f'{dir}/{name}/{subdir}'):
|
||||||
continue
|
continue
|
||||||
files = get_voice( f'{name}/{subdir}', dir=dir )
|
files = get_voice( f'{name}/{subdir}', dir=dir, extensions=extensions )
|
||||||
if len(files) == 0:
|
if len(files) == 0:
|
||||||
continue
|
continue
|
||||||
res.append(f'{name}/{subdir}')
|
res.append(f'{name}/{subdir}')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user