forked from mrq/ai-voice-cloning
whispercpp actually works now (language loading was weird, and segment timestamps needed to be divided by 100 before slicing); transcribing audio now checks each sliced segment for silence and discards the silent ones
This commit is contained in:
parent
b8a620e8d7
commit
d97639e138
22
src/utils.py
22
src/utils.py
|
@ -39,6 +39,7 @@ from tortoise.utils.device import get_device_name, set_device_name
|
||||||
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
|
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
|
||||||
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"]
|
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"]
|
||||||
WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
|
WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
|
||||||
|
EPOCH_SCHEDULE = [ 9, 18, 25, 33 ]
|
||||||
|
|
||||||
args = None
|
args = None
|
||||||
tts = None
|
tts = None
|
||||||
|
@ -997,11 +998,12 @@ def whisper_transcribe( file, language=None ):
|
||||||
}
|
}
|
||||||
for segment in segments:
|
for segment in segments:
|
||||||
reparsed = {
|
reparsed = {
|
||||||
'start': segment[0],
|
'start': segment[0] / 100.0,
|
||||||
'end': segment[1],
|
'end': segment[1] / 100.0,
|
||||||
'text': segment[2],
|
'text': segment[2],
|
||||||
}
|
}
|
||||||
result['segments'].append(reparsed)
|
result['segments'].append(reparsed)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
@ -1014,24 +1016,29 @@ def prepare_dataset( files, outdir, language=None, progress=None ):
|
||||||
|
|
||||||
os.makedirs(outdir, exist_ok=True)
|
os.makedirs(outdir, exist_ok=True)
|
||||||
|
|
||||||
idx = 0
|
|
||||||
results = {}
|
results = {}
|
||||||
transcription = []
|
transcription = []
|
||||||
|
|
||||||
for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress):
|
for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress):
|
||||||
|
basename = os.path.basename(file)
|
||||||
result = whisper_transcribe(file, language=language)
|
result = whisper_transcribe(file, language=language)
|
||||||
results[os.path.basename(file)] = result
|
results[basename] = result
|
||||||
print(f"Transcribed file: {file}, {len(result['segments'])} found.")
|
print(f"Transcribed file: {file}, {len(result['segments'])} found.")
|
||||||
|
|
||||||
waveform, sampling_rate = torchaudio.load(file)
|
waveform, sampling_rate = torchaudio.load(file)
|
||||||
num_channels, num_frames = waveform.shape
|
num_channels, num_frames = waveform.shape
|
||||||
|
|
||||||
|
idx = 0
|
||||||
for segment in result['segments']: # enumerate_progress(result['segments'], desc="Segmenting voice file", progress=progress):
|
for segment in result['segments']: # enumerate_progress(result['segments'], desc="Segmenting voice file", progress=progress):
|
||||||
start = int(segment['start'] * sampling_rate)
|
start = int(segment['start'] * sampling_rate)
|
||||||
end = int(segment['end'] * sampling_rate)
|
end = int(segment['end'] * sampling_rate)
|
||||||
|
|
||||||
sliced_waveform = waveform[:, start:end]
|
sliced_waveform = waveform[:, start:end]
|
||||||
sliced_name = f"{pad(idx, 4)}.wav"
|
sliced_name = basename.replace(".wav", f"_{pad(idx, 4)}.wav")
|
||||||
|
|
||||||
|
if not torch.any(sliced_waveform < 0):
|
||||||
|
print(f"Error with {sliced_name}, skipping...")
|
||||||
|
continue
|
||||||
|
|
||||||
torchaudio.save(f"{outdir}/{sliced_name}", sliced_waveform, sampling_rate)
|
torchaudio.save(f"{outdir}/{sliced_name}", sliced_waveform, sampling_rate)
|
||||||
|
|
||||||
|
@ -1056,7 +1063,6 @@ def calc_iterations( epochs, lines, batch_size ):
|
||||||
iterations = int(epochs * lines / float(batch_size))
|
iterations = int(epochs * lines / float(batch_size))
|
||||||
return iterations
|
return iterations
|
||||||
|
|
||||||
EPOCH_SCHEDULE = [ 9, 18, 25, 33 ]
|
|
||||||
def schedule_learning_rate( iterations, schedule=EPOCH_SCHEDULE ):
|
def schedule_learning_rate( iterations, schedule=EPOCH_SCHEDULE ):
|
||||||
return [int(iterations * d) for d in schedule]
|
return [int(iterations * d) for d in schedule]
|
||||||
|
|
||||||
|
@ -1750,12 +1756,14 @@ def load_whisper_model(language=None, model_name=None, progress=None):
|
||||||
print(f"Loading specialized model for language: {language}")
|
print(f"Loading specialized model for language: {language}")
|
||||||
|
|
||||||
notify_progress(f"Loading Whisper model: {model_name}", progress)
|
notify_progress(f"Loading Whisper model: {model_name}", progress)
|
||||||
|
|
||||||
if args.whisper_cpp:
|
if args.whisper_cpp:
|
||||||
from whispercpp import Whisper
|
from whispercpp import Whisper
|
||||||
if not language:
|
if not language:
|
||||||
language = 'auto'
|
language = 'auto'
|
||||||
|
|
||||||
whisper_model = Whisper(model_name, models_dir='./models/', language=language.encode('ascii'))
|
b_lang = language.encode('ascii')
|
||||||
|
whisper_model = Whisper(model_name, models_dir='./models/', language=b_lang)
|
||||||
else:
|
else:
|
||||||
import whisper
|
import whisper
|
||||||
whisper_model = whisper.load_model(model_name)
|
whisper_model = whisper.load_model(model_name)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user