forked from camenduru/ai-voice-cloning
multichannel audio now reports the correct duration (surprised it took this long for me to source multichannel audio)
This commit is contained in:
parent
32d968a8cd
commit
dadb1fca6b
10
src/utils.py
10
src/utils.py
|
@@ -519,7 +519,7 @@ def update_baseline_for_latents_chunks( voice ):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
metadata = torchaudio.info(f'{path}/{file}')
|
metadata = torchaudio.info(f'{path}/{file}')
|
||||||
duration = metadata.num_channels * metadata.num_frames / metadata.sample_rate
|
duration = metadata.num_frames / metadata.sample_rate
|
||||||
total_duration += duration
|
total_duration += duration
|
||||||
total = total + 1
|
total = total + 1
|
||||||
|
|
||||||
|
@@ -1079,7 +1079,7 @@ def validate_waveform( waveform, sample_rate, min_only=False ):
|
||||||
return "Waveform is empty"
|
return "Waveform is empty"
|
||||||
|
|
||||||
num_channels, num_frames = waveform.shape
|
num_channels, num_frames = waveform.shape
|
||||||
duration = num_channels * num_frames / sample_rate
|
duration = num_frames / sample_rate
|
||||||
|
|
||||||
if duration < MIN_TRAINING_DURATION:
|
if duration < MIN_TRAINING_DURATION:
|
||||||
return "Duration too short ({:.3f}s < {:.3f}s)".format(duration, MIN_TRAINING_DURATION)
|
return "Duration too short ({:.3f}s < {:.3f}s)".format(duration, MIN_TRAINING_DURATION)
|
||||||
|
@@ -1176,7 +1176,7 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
|
||||||
result = results[filename]
|
result = results[filename]
|
||||||
waveform, sample_rate = torchaudio.load(path)
|
waveform, sample_rate = torchaudio.load(path)
|
||||||
num_channels, num_frames = waveform.shape
|
num_channels, num_frames = waveform.shape
|
||||||
duration = num_channels * num_frames / sample_rate
|
duration = num_frames / sample_rate
|
||||||
|
|
||||||
for segment in result['segments']:
|
for segment in result['segments']:
|
||||||
file = filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav")
|
file = filename.replace(".wav", f"_{pad(segment['id'], 4)}.wav")
|
||||||
|
@@ -1234,7 +1234,7 @@ def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=F
|
||||||
continue
|
continue
|
||||||
|
|
||||||
metadata = torchaudio.info(path)
|
metadata = torchaudio.info(path)
|
||||||
duration = metadata.num_channels * metadata.num_frames / metadata.sample_rate
|
duration = metadata.num_frames / metadata.sample_rate
|
||||||
if duration >= MAX_TRAINING_DURATION:
|
if duration >= MAX_TRAINING_DURATION:
|
||||||
message = f"Audio too large, using segments: {filename}"
|
message = f"Audio too large, using segments: {filename}"
|
||||||
print(message)
|
print(message)
|
||||||
|
@@ -1285,7 +1285,7 @@ def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=F
|
||||||
culled = len(text) < text_length
|
culled = len(text) < text_length
|
||||||
if not culled and audio_length > 0:
|
if not culled and audio_length > 0:
|
||||||
num_channels, num_frames = waveform.shape
|
num_channels, num_frames = waveform.shape
|
||||||
duration = num_channels * num_frames / sample_rate
|
duration = num_frames / sample_rate
|
||||||
culled = duration < audio_length
|
culled = duration < audio_length
|
||||||
|
|
||||||
# for when i add in a little treat ;), as it requires normalized text
|
# for when i add in a little treat ;), as it requires normalized text
|
||||||
|
|
Loading…
Reference in New Issue
Block a user