mrq 2023-03-21 15:39:28 +00:00
parent fe24641763
commit a4afad8837
264 changed files with 8518 additions and 5851 deletions

72
.gitignore vendored
View File

@ -1,30 +1,3 @@
dlas/experiments/*
dlas/codes/*.txt
dlas/codes/wandb/*
dlas/codes/pretrained_models/*
dlas/codes/scripts/audio/pretrained_models/*
results/*
tb_logger/*
datasets/*
options/*
data/*
.vscode
*.html
*.png
*.jpg
*.gif
*.pth
*.pytorch
*.zip
*.cu
*.pt
*.pth
*.pdf
*.tsv
# template
# Byte-compiled / optimized / DLL files
__pycache__/
@ -36,6 +9,7 @@ __pycache__/
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
@ -47,12 +21,9 @@ lib64/
parts/
sdist/
var/
wheels/
pretrained/*
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
@ -72,9 +43,8 @@ htmlcov/
.cache
nosetests.xml
coverage.xml
*.cover
*,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
@ -83,14 +53,6 @@ coverage.xml
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
@ -98,36 +60,8 @@ docs/_build/
# PyBuilder
target/
# Jupyter Notebook
#Ipython Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/

View File

@ -1 +1 @@
recursive-include codes/*
recursive-include dlas/*

View File

@ -3,7 +3,7 @@ import torch
import torch.utils.data
from munch import munchify
from utils.util import opt_get
from dlas.utils.util import opt_get
def create_dataloader(dataset, dataset_opt, opt=None, sampler=None, collate_fn=None, shuffle=True):
@ -33,77 +33,90 @@ def create_dataset(dataset_opt, return_collate=False):
# datasets for image restoration
if mode == 'fullimage':
from data.images.full_image_dataset import FullImageDataset as D
from dlas.data.images.full_image_dataset import FullImageDataset as D
elif mode == 'single_image_extensible':
from data.images.single_image_dataset import SingleImageDataset as D
from dlas.data.images.single_image_dataset import \
SingleImageDataset as D
elif mode == 'multi_frame_extensible':
from data.images.multi_frame_dataset import MultiFrameDataset as D
from dlas.data.images.multi_frame_dataset import MultiFrameDataset as D
elif mode == 'combined':
from data.combined_dataset import CombinedDataset as D
from dlas.data.combined_dataset import CombinedDataset as D
elif mode == 'multiscale':
from data.images.multiscale_dataset import MultiScaleDataset as D
from dlas.data.images.multiscale_dataset import MultiScaleDataset as D
elif mode == 'paired_frame':
from data.images.paired_frame_dataset import PairedFrameDataset as D
from dlas.data.images.paired_frame_dataset import \
PairedFrameDataset as D
elif mode == 'stylegan2':
from data.images.stylegan2_dataset import Stylegan2Dataset as D
from dlas.data.images.stylegan2_dataset import Stylegan2Dataset as D
elif mode == 'imagefolder':
from data.images.image_folder_dataset import ImageFolderDataset as D
from dlas.data.images.image_folder_dataset import \
ImageFolderDataset as D
elif mode == 'torch_dataset':
from data.torch_dataset import TorchDataset as D
elif mode == 'byol_dataset':
from data.images.byol_attachment import ByolDatasetWrapper as D
from dlas.data.images.byol_attachment import ByolDatasetWrapper as D
elif mode == 'byol_structured_dataset':
from data.images.byol_attachment import StructuredCropDatasetWrapper as D
from dlas.data.images.byol_attachment import \
StructuredCropDatasetWrapper as D
elif mode == 'random_aug_wrapper':
from data.images.byol_attachment import DatasetRandomAugWrapper as D
from dlas.data.images.byol_attachment import \
DatasetRandomAugWrapper as D
elif mode == 'random_dataset':
from data.images.random_dataset import RandomDataset as D
from dlas.data.images.random_dataset import RandomDataset as D
elif mode == 'zipfile':
from data.images.zip_file_dataset import ZipFileDataset as D
from dlas.data.images.zip_file_dataset import ZipFileDataset as D
elif mode == 'nv_tacotron':
from data.audio.nv_tacotron_dataset import TextWavLoader as D
from data.audio.nv_tacotron_dataset import TextMelCollate as C
from models.audio.tts.tacotron2 import create_hparams
from dlas.data.audio.nv_tacotron_dataset import TextMelCollate as C
from dlas.data.audio.nv_tacotron_dataset import TextWavLoader as D
from dlas.models.audio.tts.tacotron2 import create_hparams
default_params = create_hparams()
default_params.update(dataset_opt)
dataset_opt = munchify(default_params)
if opt_get(dataset_opt, ['needs_collate'], True):
collate = C()
elif mode == 'paired_voice_audio':
from data.audio.paired_voice_audio_dataset import TextWavLoader as D
from models.audio.tts.tacotron2 import create_hparams
from dlas.data.audio.paired_voice_audio_dataset import \
TextWavLoader as D
from dlas.models.audio.tts.tacotron2 import create_hparams
default_params = create_hparams()
default_params.update(dataset_opt)
dataset_opt = munchify(default_params)
elif mode == 'fast_paired_voice_audio':
from data.audio.fast_paired_dataset import FastPairedVoiceDataset as D
from models.audio.tts.tacotron2 import create_hparams
from dlas.data.audio.fast_paired_dataset import \
FastPairedVoiceDataset as D
from dlas.models.audio.tts.tacotron2 import create_hparams
default_params = create_hparams()
default_params.update(dataset_opt)
dataset_opt = munchify(default_params)
elif mode == 'fast_paired_voice_audio_with_phonemes':
from data.audio.fast_paired_dataset_with_phonemes import FastPairedVoiceDataset as D
from models.audio.tts.tacotron2 import create_hparams
from dlas.data.audio.fast_paired_dataset_with_phonemes import \
FastPairedVoiceDataset as D
from dlas.models.audio.tts.tacotron2 import create_hparams
default_params = create_hparams()
default_params.update(dataset_opt)
dataset_opt = munchify(default_params)
elif mode == 'gpt_tts':
from data.audio.gpt_tts_dataset import GptTtsDataset as D
from data.audio.gpt_tts_dataset import GptTtsCollater as C
from dlas.data.audio.gpt_tts_dataset import GptTtsCollater as C
from dlas.data.audio.gpt_tts_dataset import GptTtsDataset as D
collate = C(dataset_opt)
elif mode == 'unsupervised_audio':
from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset as D
from dlas.data.audio.unsupervised_audio_dataset import \
UnsupervisedAudioDataset as D
elif mode == 'unsupervised_audio_with_noise':
from data.audio.audio_with_noise_dataset import AudioWithNoiseDataset as D
from dlas.data.audio.audio_with_noise_dataset import \
AudioWithNoiseDataset as D
elif mode == 'preprocessed_mel':
from data.audio.preprocessed_mel_dataset import PreprocessedMelDataset as D
from dlas.data.audio.preprocessed_mel_dataset import \
PreprocessedMelDataset as D
elif mode == 'grand_conjoined_voice':
from data.audio.grand_conjoined_dataset import GrandConjoinedDataset as D
from data.zero_pad_dict_collate import ZeroPadDictCollate as C
from dlas.data.audio.grand_conjoined_dataset import \
GrandConjoinedDataset as D
from dlas.data.zero_pad_dict_collate import ZeroPadDictCollate as C
if opt_get(dataset_opt, ['needs_collate'], False):
collate = C()
else:
raise NotImplementedError('Dataset [{:s}] is not recognized.'.format(mode))
raise NotImplementedError(
'Dataset [{:s}] is not recognized.'.format(mode))
dataset = D(dataset_opt)
if return_collate:
@ -115,9 +128,10 @@ def create_dataset(dataset_opt, return_collate=False):
def get_dataset_debugger(dataset_opt):
mode = dataset_opt['mode']
if mode == 'paired_voice_audio':
from data.audio.paired_voice_audio_dataset import PairedVoiceDebugger
from dlas.data.audio.paired_voice_audio_dataset import \
PairedVoiceDebugger
return PairedVoiceDebugger()
elif mode == 'fast_paired_voice_audio':
from data.audio.fast_paired_dataset import FastPairedVoiceDebugger
from dlas.data.audio.fast_paired_dataset import FastPairedVoiceDebugger
return FastPairedVoiceDebugger()
return None
return None
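
# Illustrative sketch (not part of this commit): the dispatcher above is driven
# by an options dict whose 'mode' key picks the dataset class; the flow below
# mirrors the __main__ test blocks found in the dataset files later in this
# diff. The params shown are placeholders, and depending on how the package is
# installed the import may need the new 'dlas.' prefix.
from data import create_dataloader, create_dataset

params = {
    'mode': 'unsupervised_audio',  # selects the dataset class in create_dataset()
    'phase': 'train',
    'n_workers': 0,
    'batch_size': 16,
    # ...plus whatever keys the chosen dataset requires (paths, lengths, etc.)
}

ds, collate = create_dataset(params, return_collate=True)
dl = create_dataloader(ds, params, collate_fn=collate)
for batch in dl:
    pass  # inspect or train on the batch here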

View File

View File

@ -2,18 +2,17 @@ import random
import sys
from math import pi
import librosa
import torch
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset
from tqdm import tqdm
import torch.nn.functional as F
from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
from data.util import load_paths_from_cache, find_files_of_type, is_audio_file
# Just all ones.
from utils.util import opt_get
from dlas.data.audio.unsupervised_audio_dataset import (
UnsupervisedAudioDataset, load_audio)
from dlas.data.util import (find_files_of_type, is_audio_file,
load_paths_from_cache)
from dlas.utils.util import opt_get
def _integration_fn_fully_enabled(n):
@ -23,7 +22,7 @@ def _integration_fn_fully_enabled(n):
# Randomly assigns up to 5 blocks of the output tensor the value '1'. Rest is zero
def _integration_fn_spiky(n):
fn = torch.zeros((n,))
spikes = random.randint(1,5)
spikes = random.randint(1, 5)
for _ in range(spikes):
sz = random.randint(n//8, n//2)
pos = random.randint(0, n)
@ -35,18 +34,19 @@ def _integration_fn_spiky(n):
# Uses a sinusoidal ramp up and down (of random length) to a peak which is held for a random duration.
def _integration_fn_smooth(n):
center = random.randint(1, n-2)
max_duration=n-center-1
max_duration = n-center-1
duration = random.randint(max_duration//4, max_duration)
end = center+duration
ramp_up_sz = random.randint(n//16,n//4)
ramp_up = torch.sin(pi*torch.arange(0,ramp_up_sz)/(2*ramp_up_sz))
ramp_up_sz = random.randint(n//16, n//4)
ramp_up = torch.sin(pi*torch.arange(0, ramp_up_sz)/(2*ramp_up_sz))
if ramp_up_sz > center:
ramp_up = ramp_up[(ramp_up_sz-center):]
ramp_up_sz = center
ramp_down_sz = random.randint(n//16,n//4)
ramp_down = torch.flip(torch.sin(pi*torch.arange(0,ramp_down_sz)/(2*ramp_down_sz)), dims=[0])
ramp_down_sz = random.randint(n//16, n//4)
ramp_down = torch.flip(
torch.sin(pi*torch.arange(0, ramp_down_sz)/(2*ramp_down_sz)), dims=[0])
if ramp_down_sz > (n-end):
ramp_down = ramp_down[:(n-end)]
ramp_down_sz = n-end
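
# Illustrative sketch (an assumption, not code from this file): each
# _integration_fn_* helper returns a length-n envelope in [0, 1]; a natural use
# is to gate a noise clip with the envelope before mixing it into the voice
# clip, so the noise fades in and out instead of being applied uniformly.
import math
import torch

def mix_with_envelope(clip, noise, envelope):
    # clip, noise: (1, n) mono audio; envelope: (n,) weights in [0, 1].
    return clip + noise * envelope.unsqueeze(0)

n = 22050
env = torch.sin(math.pi * torch.arange(0, n) / n)   # stand-in for _integration_fn_smooth(n)
mixed = mix_with_envelope(torch.randn(1, n), 0.1 * torch.randn(1, n), env)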
@ -71,16 +71,22 @@ def load_rir(path, sr, max_sz):
Wraps an unsupervised_audio_dataset and applies noise to the output clips, then provides labels depending on what
noise was added.
'''
class AudioWithNoiseDataset(Dataset):
def __init__(self, opt):
self.underlying_dataset = UnsupervisedAudioDataset(opt)
self.env_noise_paths = load_paths_from_cache(opt['env_noise_paths'], opt['env_noise_cache'])
self.music_paths = load_paths_from_cache(opt['music_paths'], opt['music_cache'])
self.openair_paths = find_files_of_type('img', opt['openair_path'], qualifier=is_audio_file)[0]
self.env_noise_paths = load_paths_from_cache(
opt['env_noise_paths'], opt['env_noise_cache'])
self.music_paths = load_paths_from_cache(
opt['music_paths'], opt['music_cache'])
self.openair_paths = find_files_of_type(
'img', opt['openair_path'], qualifier=is_audio_file)[0]
self.min_volume = opt_get(opt, ['min_noise_volume'], .2)
self.max_volume = opt_get(opt, ['max_noise_volume'], .5)
self.sampling_rate = self.underlying_dataset.sampling_rate
self.use_gpu_for_reverb_compute = opt_get(opt, ['use_gpu_for_reverb_compute'], True)
self.use_gpu_for_reverb_compute = opt_get(
opt, ['use_gpu_for_reverb_compute'], True)
self.openair_kernels = None
self.current_item_fetch = 0
self.fetch_error_count = 0
@ -90,7 +96,8 @@ class AudioWithNoiseDataset(Dataset):
# Load the openair reverbs as CUDA tensors.
self.openair_kernels = []
for oa in self.openair_paths:
self.openair_kernels.append(load_rir(oa, self.underlying_dataset.sampling_rate, self.underlying_dataset.sampling_rate*2).cuda())
self.openair_kernels.append(load_rir(
oa, self.underlying_dataset.sampling_rate, self.underlying_dataset.sampling_rate*2).cuda())
def __getitem__(self, item):
if self.current_item_fetch != item:
@ -113,10 +120,11 @@ class AudioWithNoiseDataset(Dataset):
clip = clip * clipvol
label = random.randint(0, 4) # Currently excludes GSM corruption.
#label = 3
# label = 3
if label > 0 and label < 4: # 0 is basically "leave it alone"
aug_needed = True
augvol = (random.random() * (self.max_volume-self.min_volume) + self.min_volume)
augvol = (random.random() * (self.max_volume -
self.min_volume) + self.min_volume)
if label == 1:
# Add environmental noise.
augpath = random.choice(self.env_noise_paths)
@ -131,13 +139,15 @@ class AudioWithNoiseDataset(Dataset):
# This can take two forms:
if padding_room < 22000 or random.random() < .5:
# (1) The voices talk over one another. If there is no padding room, we always take this choice.
intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
intg_fns = [_integration_fn_smooth,
_integration_fn_fully_enabled]
else:
# (2) There are simply two voices in the clip, separated from one another.
# This is a special case that does not use the same logic as the rest of the augmentations.
aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
aug = load_audio(
augpath, self.underlying_dataset.sampling_rate)
# Pad with some random silence
aug = F.pad(aug, (random.randint(20,4000), 0))
aug = F.pad(aug, (random.randint(20, 4000), 0))
# Fit what we can given the padding room we have.
aug = aug[:, :padding_room]
clip = torch.cat([clip, aug], dim=1)
@ -146,7 +156,8 @@ class AudioWithNoiseDataset(Dataset):
out['clip_lengths'] = torch.tensor(clip.shape[-1])
aug_needed = False
if aug_needed:
aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
aug = load_audio(
augpath, self.underlying_dataset.sampling_rate)
if aug.shape[1] > clip.shape[1]:
n, cn = aug.shape[1], clip.shape[1]
gap = n-cn
@ -157,7 +168,8 @@ class AudioWithNoiseDataset(Dataset):
if aug.shape[1] < clip.shape[1]:
gap = clip.shape[1] - aug.shape[1]
placement = random.randint(0, gap-1)
aug = torch.nn.functional.pad(aug, (placement, gap-placement))
aug = torch.nn.functional.pad(
aug, (placement, gap-placement))
clip = clip + aug
elif label == 4:
# Perform reverb (to simulate being in a large room with an omni-mic). This is performed by convolving
@ -166,19 +178,23 @@ class AudioWithNoiseDataset(Dataset):
rir = random.choice(self.openair_kernels)
else:
augpath = random.choice(self.openair_paths)
rir = load_rir(augpath, self.underlying_dataset.sampling_rate, clip.shape[-1])
rir = load_rir(
augpath, self.underlying_dataset.sampling_rate, clip.shape[-1])
clip = torch.nn.functional.pad(clip, (rir.shape[1]-1, 0))
if self.use_gpu_for_reverb_compute:
clip = clip.cuda()
clip = torch.nn.functional.conv1d(clip.unsqueeze(0), rir.unsqueeze(0)).squeeze(0).cpu()
clip = torch.nn.functional.conv1d(
clip.unsqueeze(0), rir.unsqueeze(0)).squeeze(0).cpu()
elif label == 5:
# Apply the GSM codec to simulate cellular phone audio.
clip = torchaudio.functional.apply_codec(clip, self.underlying_dataset.sampling_rate, format="gsm")
clip = torchaudio.functional.apply_codec(
clip, self.underlying_dataset.sampling_rate, format="gsm")
except:
if self.fetch_error_count > 10:
print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
print(
f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
print(sys.exc_info())
#raise # Uncomment to surface exceptions.
# raise # Uncomment to surface exceptions.
self.fetch_error_count += 1
return self[item]
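
# Standalone illustration of the reverb branch above: convolve a mono clip with
# a room impulse response (RIR), left-padding by K-1 so the output keeps the
# original length. The helper name is mine; the padding/conv1d handling mirrors
# the hunk.
import torch
import torch.nn.functional as F

def apply_reverb(clip, rir):
    # clip: (1, T) audio, rir: (1, K) impulse response.
    clip = F.pad(clip, (rir.shape[-1] - 1, 0))
    return F.conv1d(clip.unsqueeze(0), rir.unsqueeze(0)).squeeze(0)

out = apply_reverb(torch.randn(1, 22050), 0.01 * torch.rand(1, 441))
assert out.shape == (1, 22050)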
@ -187,7 +203,7 @@ class AudioWithNoiseDataset(Dataset):
clip = F.pad(clip, (0, padding_room))
out['clip'] = clip
out['label'] = label
#out['aug'] = aug
# out['aug'] = aug
out['augpath'] = augpath
out['augvol'] = augvol
out['clipvol'] = clipvol
@ -216,14 +232,15 @@ if __name__ == '__main__':
'openair_path': 'D:\\data\\audio\\openair\\resampled',
'use_gpu_for_reverb_compute': False,
}
from data import create_dataset, create_dataloader, util
from data import create_dataloader, create_dataset, util
ds = create_dataset(params)
dl = create_dataloader(ds, params, pin_memory=False)
i = 0
for b in tqdm(dl):
for b_ in range(b['clip'].shape[0]):
#torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_][:, :b['clip_lengths'][b_]], ds.sampling_rate)
#torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
# torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_][:, :b['clip_lengths'][b_]], ds.sampling_rate)
# torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
print(
f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
i += 1

View File

@ -12,13 +12,15 @@ import torchaudio
from tqdm import tqdm
from transformers import Wav2Vec2CTCTokenizer
from data.audio.paired_voice_audio_dataset import CharacterTokenizer
from data.audio.unsupervised_audio_dataset import load_audio, load_similar_clips
from utils.util import opt_get
from dlas.data.audio.paired_voice_audio_dataset import CharacterTokenizer
from dlas.data.audio.unsupervised_audio_dataset import (load_audio,
load_similar_clips)
from dlas.utils.util import opt_get
def parse_tsv_aligned_codes(line, base_path):
fpt = line.strip().split('\t')
def convert_string_list_to_tensor(strlist):
if strlist.startswith('['):
strlist = strlist[1:]
@ -43,6 +45,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
The upshot is that this dataset loads extremely quickly and consumes almost no system memory.
"""
def __init__(self, hparams):
self.paths = hparams['path']
if not isinstance(self.paths, list):
@ -52,26 +55,33 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
self.types = opt_get(hparams, ['types'], [0 for _ in self.paths])
self.load_conditioning = opt_get(hparams, ['load_conditioning'], False)
self.conditioning_candidates = opt_get(hparams, ['num_conditioning_candidates'], 1)
self.conditioning_length = opt_get(hparams, ['conditioning_length'], 44100)
self.produce_ctc_metadata = opt_get(hparams, ['produce_ctc_metadata'], False)
self.debug_failures = opt_get(hparams, ['debug_loading_failures'], False)
self.conditioning_candidates = opt_get(
hparams, ['num_conditioning_candidates'], 1)
self.conditioning_length = opt_get(
hparams, ['conditioning_length'], 44100)
self.produce_ctc_metadata = opt_get(
hparams, ['produce_ctc_metadata'], False)
self.debug_failures = opt_get(
hparams, ['debug_loading_failures'], False)
self.text_cleaners = hparams.text_cleaners
self.sample_rate = hparams.sample_rate
self.aligned_codes_to_audio_ratio = 443 * self.sample_rate // 22050
self.max_wav_len = opt_get(hparams, ['max_wav_length'], None)
self.load_aligned_codes = opt_get(hparams, ['load_aligned_codes'], False)
self.load_aligned_codes = opt_get(
hparams, ['load_aligned_codes'], False)
if self.max_wav_len is not None:
self.max_aligned_codes = self.max_wav_len // self.aligned_codes_to_audio_ratio
self.max_text_len = opt_get(hparams, ['max_text_length'], None)
assert self.max_wav_len is not None and self.max_text_len is not None
self.use_bpe_tokenizer = opt_get(hparams, ['use_bpe_tokenizer'], False)
if self.use_bpe_tokenizer:
from data.audio.voice_tokenizer import VoiceBpeTokenizer
self.tokenizer = VoiceBpeTokenizer(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
from dlas.data.audio.voice_tokenizer import VoiceBpeTokenizer
self.tokenizer = VoiceBpeTokenizer(opt_get(
hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
else:
self.tokenizer = CharacterTokenizer()
self.skipped_items = 0 # records how many items are skipped when accessing an index.
# records how many items are skipped when accessing an index.
self.skipped_items = 0
self.load_times = torch.zeros((256,))
self.load_ind = 0
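
# Hypothetical stand-in for dlas.utils.util.opt_get (the real helper is not
# shown in this diff): nearly every option above is read through it, walking a
# nested dict along a list of keys and falling back to a default when any key
# is missing.
def opt_get(opt, keys, default=None):
    if opt is None:
        return default
    val = opt
    for k in keys:
        try:
            val = val[k]
        except (KeyError, TypeError, IndexError):
            return default
    return val

assert opt_get({'conditioning_length': 44100}, ['conditioning_length'], 22050) == 44100
assert opt_get({}, ['max_wav_length'], None) is None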
@ -110,7 +120,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
try: # This can fail when seeking to a UTF-8 escape byte.
f.readline()
except:
return self.load_random_line(depth=depth + 1), type # On failure, just recurse and try again.
# On failure, just recurse and try again.
return self.load_random_line(depth=depth + 1), type
l2 = f.readline()
if l2:
@ -119,14 +130,16 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
return parse_tsv_aligned_codes(l2, base_path), type
except:
print(f"error parsing random offset: {sys.exc_info()}")
return self.load_random_line(depth=depth+1), type # On failure, just recurse and try again.
# On failure, just recurse and try again.
return self.load_random_line(depth=depth+1), type
def get_ctc_metadata(self, codes):
grouped = groupby(codes.tolist())
rcodes, repeats, seps = [], [], [0]
for val, group in grouped:
if val == 0:
seps[-1] = len(list(group)) # This is a very important distinction! It means the padding belongs to the character preceding it.
# This is a very important distinction! It means the padding belongs to the character preceding it.
seps[-1] = len(list(group))
else:
rcodes.append(val)
repeats.append(len(list(group)))
@ -142,7 +155,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if rcodes.shape[0] < self.max_text_len:
gap = self.max_text_len - rcodes.shape[0]
rcodes = F.pad(rcodes, (0, gap))
repeats = F.pad(repeats, (0, gap), value=1) # The minimum value for repeats is 1, hence this is the pad value too.
# The minimum value for repeats is 1, hence this is the pad value too.
repeats = F.pad(repeats, (0, gap), value=1)
seps = F.pad(seps, (0, gap))
elif rcodes.shape[0] > self.max_text_len:
rcodes = rcodes[:self.max_text_len]
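
# get_ctc_metadata above is built on itertools.groupby run-length encoding:
# non-zero runs become (code, repeat) pairs, while zero runs are recorded as
# separator lengths attributed to the code before them. A tiny self-contained
# illustration of that encoding step (the padding logic is omitted):
from itertools import groupby

codes = [0, 0, 13, 13, 13, 0, 0, 0, 7, 7]
runs = [(val, len(list(grp))) for val, grp in groupby(codes)]
# runs == [(0, 2), (13, 3), (0, 3), (7, 2)]

rcodes = [v for v, n in runs if v != 0]      # [13, 7]
repeats = [n for v, n in runs if v != 0]     # [3, 2]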
@ -165,7 +179,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if text is None or len(text.strip()) == 0:
raise ValueError
cond, cond_is_self = load_similar_clips(apt[0], self.conditioning_length, self.sample_rate,
n=self.conditioning_candidates) if self.load_conditioning else (None, False)
n=self.conditioning_candidates) if self.load_conditioning else (None, False)
except:
if self.skipped_items > 100:
raise # Rethrow if we have nested too far.
@ -179,12 +193,13 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
self.skipped_items = 0
if wav is None or \
(self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len) or \
(self.max_text_len is not None and tseq.shape[0] > self.max_text_len):
(self.max_text_len is not None and tseq.shape[0] > self.max_text_len):
# Basically, this audio file is nonexistent or too long to be supported by the dataset.
# It's hard to handle this situation properly. The best bet is to return a random valid item and skew the dataset somewhat as a result.
if self.debug_failures:
print(f"error loading {path}: ranges are out of bounds; {wav.shape[-1]}, {tseq.shape[0]}")
rv = random.randint(0,len(self)-1)
print(
f"error loading {path}: ranges are out of bounds; {wav.shape[-1]}, {tseq.shape[0]}")
rv = random.randint(0, len(self)-1)
return self[rv]
orig_output = wav.shape[-1]
orig_text_len = tseq.shape[0]
@ -192,7 +207,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if wav.shape[-1] != self.max_wav_len:
wav = F.pad(wav, (0, self.max_wav_len - wav.shape[-1]))
# These codes are aligned to audio inputs, so make sure to pad them as well.
aligned_codes = F.pad(aligned_codes, (0, self.max_aligned_codes-aligned_codes.shape[0]))
aligned_codes = F.pad(
aligned_codes, (0, self.max_aligned_codes-aligned_codes.shape[0]))
if tseq.shape[0] != self.max_text_len:
tseq = F.pad(tseq, (0, self.max_text_len - tseq.shape[0]))
@ -223,7 +239,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
return res
def __len__(self):
return self.total_size_bytes // 1000 # 1000 cuts down a TSV file to the actual length pretty well.
# 1000 cuts down a TSV file to the actual length pretty well.
return self.total_size_bytes // 1000
class FastPairedVoiceDebugger:
@ -243,7 +260,8 @@ class FastPairedVoiceDebugger:
if isinstance(state, dict):
self.total_items = opt_get(state, ['total_items'], 0)
self.loaded_items = opt_get(state, ['loaded_items'], 0)
self.self_conditioning_items = opt_get(state, ['self_conditioning_items'], 0)
self.self_conditioning_items = opt_get(
state, ['self_conditioning_items'], 0)
def update(self, batch):
self.total_items += batch['wav'].shape[0]
@ -252,7 +270,8 @@ class FastPairedVoiceDebugger:
for filename in batch['filenames']:
self.unique_files.add(hashlib.sha256(filename.encode('utf-8')))
if 'conditioning' in batch.keys():
self.self_conditioning_items += batch['conditioning_contains_self'].sum().item()
self.self_conditioning_items += batch['conditioning_contains_self'].sum(
).item()
def get_debugging_map(self):
return {
@ -269,13 +288,13 @@ if __name__ == '__main__':
params = {
'mode': 'fast_paired_voice_audio',
'path': ['y:/libritts/train-other-500/transcribed-oco.tsv',
'y:/libritts/train-clean-100/transcribed-oco.tsv',
'y:/libritts/train-clean-360/transcribed-oco.tsv',
'y:/clips/books1/transcribed-oco.tsv',
'y:/clips/books2/transcribed-oco.tsv',
'y:/bigasr_dataset/hifi_tts/transcribed-oco.tsv',
'y:/clips/podcasts-1/transcribed-oco.tsv',],
'types': [0,1,1,1,2,2,0],
'y:/libritts/train-clean-100/transcribed-oco.tsv',
'y:/libritts/train-clean-360/transcribed-oco.tsv',
'y:/clips/books1/transcribed-oco.tsv',
'y:/clips/books2/transcribed-oco.tsv',
'y:/bigasr_dataset/hifi_tts/transcribed-oco.tsv',
'y:/clips/podcasts-1/transcribed-oco.tsv',],
'types': [0, 1, 1, 1, 2, 2, 0],
'phase': 'train',
'n_workers': 0,
'batch_size': batch_sz,
@ -289,11 +308,12 @@ if __name__ == '__main__':
'load_aligned_codes': True,
'produce_ctc_metadata': True,
}
from data import create_dataset, create_dataloader
from data import create_dataloader, create_dataset
def save(b, i, ib, key, c=None):
if c is not None:
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav', b[key][ib][c], 22050)
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav',
b[key][ib][c], 22050)
else:
torchaudio.save(f'{i}_clip_{ib}_{key}.wav', b[key][ib], 22050)
@ -304,8 +324,8 @@ if __name__ == '__main__':
max_pads, max_repeats = 0, 0
for i, b in tqdm(enumerate(dl)):
for ib in range(batch_sz):
#max_pads = max(max_pads, b['ctc_pads'].max())
#max_repeats = max(max_repeats, b['ctc_repeats'].max())
# max_pads = max(max_pads, b['ctc_pads'].max())
# max_repeats = max(max_repeats, b['ctc_repeats'].max())
print(f'{i} {ib} {b["real_text"][ib]}')
save(b, i, ib, 'wav')
save(b, i, ib, 'conditioning', 0)
@ -314,4 +334,3 @@ if __name__ == '__main__':
if i > 15:
break
print(max_pads, max_repeats)
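
# Minimal standalone sketch of the sampling strategy load_random_line uses
# above (not the class's actual method): seek to a random byte offset in the
# TSV, discard the partial line at that offset, and take the next full line,
# retrying when the seek lands badly. Binary-mode reads are used here so that
# decode errors can be caught explicitly.
import os
import random

def sample_random_line(path, max_retries=10):
    size = os.path.getsize(path)
    for _ in range(max_retries):
        with open(path, 'rb') as f:
            f.seek(random.randint(0, max(size - 1, 0)))
            f.readline()                      # discard the (likely partial) line
            raw = f.readline()
        try:
            line = raw.decode('utf-8').strip()
        except UnicodeDecodeError:
            continue                          # landed inside a multi-byte char; retry
        if line:
            return line
    return None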

View File

@ -12,13 +12,15 @@ import torchaudio
from tqdm import tqdm
from transformers import Wav2Vec2Processor
from data.audio.paired_voice_audio_dataset import CharacterTokenizer
from data.audio.unsupervised_audio_dataset import load_audio, load_similar_clips
from utils.util import opt_get
from dlas.data.audio.paired_voice_audio_dataset import CharacterTokenizer
from dlas.data.audio.unsupervised_audio_dataset import (load_audio,
load_similar_clips)
from dlas.utils.util import opt_get
def parse_tsv_aligned_codes(line, base_path):
fpt = line.strip().split('\t')
def convert_string_list_to_tensor(strlist):
if strlist.startswith('['):
strlist = strlist[1:]
@ -43,10 +45,12 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
The upshot is that this dataset loads extremely quickly and consumes almost no system memory.
"""
def __init__(self, hparams):
self.paths = hparams['path']
phoneme_paths = hparams['phoneme_paths']
self.paths = [(p, False) for p in self.paths] + [(p, True) for p in phoneme_paths]
self.paths = [(p, False) for p in self.paths] + [(p, True)
for p in phoneme_paths]
self.paths_size_bytes = [os.path.getsize(p) for p, _ in self.paths]
self.total_size_bytes = sum(self.paths_size_bytes)
@ -54,28 +58,36 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
self.normal_text_end_token = hparams['normal_text_end_token']
self.load_conditioning = opt_get(hparams, ['load_conditioning'], False)
self.conditioning_candidates = opt_get(hparams, ['num_conditioning_candidates'], 1)
self.conditioning_length = opt_get(hparams, ['conditioning_length'], 44100)
self.produce_ctc_metadata = opt_get(hparams, ['produce_ctc_metadata'], False)
self.debug_failures = opt_get(hparams, ['debug_loading_failures'], False)
self.conditioning_candidates = opt_get(
hparams, ['num_conditioning_candidates'], 1)
self.conditioning_length = opt_get(
hparams, ['conditioning_length'], 44100)
self.produce_ctc_metadata = opt_get(
hparams, ['produce_ctc_metadata'], False)
self.debug_failures = opt_get(
hparams, ['debug_loading_failures'], False)
self.text_cleaners = hparams.text_cleaners
self.sample_rate = hparams.sample_rate
self.aligned_codes_to_audio_ratio = 443 * self.sample_rate // 22050
self.max_wav_len = opt_get(hparams, ['max_wav_length'], None)
self.load_aligned_codes = opt_get(hparams, ['load_aligned_codes'], False)
self.load_aligned_codes = opt_get(
hparams, ['load_aligned_codes'], False)
if self.max_wav_len is not None:
self.max_aligned_codes = self.max_wav_len // self.aligned_codes_to_audio_ratio
self.max_text_len = opt_get(hparams, ['max_text_length'], None)
assert self.max_wav_len is not None and self.max_text_len is not None
self.use_bpe_tokenizer = opt_get(hparams, ['use_bpe_tokenizer'], False)
if self.use_bpe_tokenizer:
from data.audio.voice_tokenizer import VoiceBpeTokenizer
self.tokenizer = VoiceBpeTokenizer(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
from dlas.data.audio.voice_tokenizer import VoiceBpeTokenizer
self.tokenizer = VoiceBpeTokenizer(opt_get(
hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
else:
self.tokenizer = CharacterTokenizer()
self.ipa_phoneme_tokenizer = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft").tokenizer
self.ipa_phoneme_tokenizer = Wav2Vec2Processor.from_pretrained(
"facebook/wav2vec2-lv-60-espeak-cv-ft").tokenizer
self.ipa_phoneme_tokenizer.do_phonemize = False
self.skipped_items = 0 # records how many items are skipped when accessing an index.
# records how many items are skipped when accessing an index.
self.skipped_items = 0
self.load_times = torch.zeros((256,))
self.load_ind = 0
@ -117,7 +129,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
try: # This can fail when seeking to a UTF-8 escape byte.
f.readline()
except:
return self.load_random_line(depth=depth + 1) # On failure, just recurse and try again.
# On failure, just recurse and try again.
return self.load_random_line(depth=depth + 1)
l2 = f.readline()
if l2:
@ -126,14 +139,16 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
return parse_tsv_aligned_codes(l2, base_path), type, is_phonetic
except:
print(f"error parsing random offset: {sys.exc_info()}")
return self.load_random_line(depth=depth+1) # On failure, just recurse and try again.
# On failure, just recurse and try again.
return self.load_random_line(depth=depth+1)
def get_ctc_metadata(self, codes):
grouped = groupby(codes.tolist())
rcodes, repeats, seps = [], [], [0]
for val, group in grouped:
if val == 0:
seps[-1] = len(list(group)) # This is a very important distinction! It means the padding belongs to the character preceding it.
# This is a very important distinction! It means the padding belongs to the character preceding it.
seps[-1] = len(list(group))
else:
rcodes.append(val)
repeats.append(len(list(group)))
@ -149,7 +164,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if rcodes.shape[0] < self.max_text_len:
gap = self.max_text_len - rcodes.shape[0]
rcodes = F.pad(rcodes, (0, gap))
repeats = F.pad(repeats, (0, gap), value=1) # The minimum value for repeats is 1, hence this is the pad value too.
# The minimum value for repeats is 1, hence this is the pad value too.
repeats = F.pad(repeats, (0, gap), value=1)
seps = F.pad(seps, (0, gap))
elif rcodes.shape[0] > self.max_text_len:
rcodes = rcodes[:self.max_text_len]
@ -171,7 +187,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if text is None or len(text.strip()) == 0:
raise ValueError
cond, cond_is_self = load_similar_clips(apt[0], self.conditioning_length, self.sample_rate,
n=self.conditioning_candidates) if self.load_conditioning else (None, False)
n=self.conditioning_candidates) if self.load_conditioning else (None, False)
except:
if self.skipped_items > 100:
raise # Rethrow if we have nested too far.
@ -185,12 +201,13 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
self.skipped_items = 0
if wav is None or \
(self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len) or \
(self.max_text_len is not None and tseq.shape[0] > self.max_text_len):
(self.max_text_len is not None and tseq.shape[0] > self.max_text_len):
# Basically, this audio file is nonexistent or too long to be supported by the dataset.
# It's hard to handle this situation properly. The best bet is to return a random valid item and skew the dataset somewhat as a result.
if self.debug_failures:
print(f"error loading {path}: ranges are out of bounds; {wav.shape[-1]}, {tseq.shape[0]}")
rv = random.randint(0,len(self)-1)
print(
f"error loading {path}: ranges are out of bounds; {wav.shape[-1]}, {tseq.shape[0]}")
rv = random.randint(0, len(self)-1)
return self[rv]
# Shift phonetic token and aligned_code tokens over.
@ -206,7 +223,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if wav.shape[-1] != self.max_wav_len:
wav = F.pad(wav, (0, self.max_wav_len - wav.shape[-1]))
# These codes are aligned to audio inputs, so make sure to pad them as well.
aligned_codes = F.pad(aligned_codes, (0, self.max_aligned_codes-aligned_codes.shape[0]))
aligned_codes = F.pad(
aligned_codes, (0, self.max_aligned_codes-aligned_codes.shape[0]))
if tseq.shape[0] != self.max_text_len:
tseq = F.pad(tseq, (0, self.max_text_len - tseq.shape[0]))
@ -237,7 +255,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
return res
def __len__(self):
return self.total_size_bytes // 1000 # 1000 cuts down a TSV file to the actual length pretty well.
# 1000 cuts down a TSV file to the actual length pretty well.
return self.total_size_bytes // 1000
class FastPairedVoiceDebugger:
@ -257,7 +276,8 @@ class FastPairedVoiceDebugger:
if isinstance(state, dict):
self.total_items = opt_get(state, ['total_items'], 0)
self.loaded_items = opt_get(state, ['loaded_items'], 0)
self.self_conditioning_items = opt_get(state, ['self_conditioning_items'], 0)
self.self_conditioning_items = opt_get(
state, ['self_conditioning_items'], 0)
def update(self, batch):
self.total_items += batch['wav'].shape[0]
@ -266,7 +286,8 @@ class FastPairedVoiceDebugger:
for filename in batch['filenames']:
self.unique_files.add(hashlib.sha256(filename.encode('utf-8')))
if 'conditioning' in batch.keys():
self.self_conditioning_items += batch['conditioning_contains_self'].sum().item()
self.self_conditioning_items += batch['conditioning_contains_self'].sum(
).item()
def get_debugging_map(self):
return {
@ -284,7 +305,7 @@ if __name__ == '__main__':
'mode': 'fast_paired_voice_audio_with_phonemes',
'path': ['y:/libritts/train-clean-100/transcribed-oco.tsv',],
'phoneme_paths': ['y:/libritts/train-other-500/transcribed-phoneme-oco.tsv'],
'types': [0,0],
'types': [0, 0],
'normal_text_end_token': 256,
'phase': 'train',
'n_workers': 0,
@ -299,11 +320,12 @@ if __name__ == '__main__':
'load_aligned_codes': False,
'debug_loading_failures': True,
}
from data import create_dataset, create_dataloader
from data import create_dataloader, create_dataset
def save(b, i, ib, key, c=None):
if c is not None:
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav', b[key][ib][c], 22050)
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav',
b[key][ib][c], 22050)
else:
torchaudio.save(f'{i}_clip_{ib}_{key}.wav', b[key][ib], 22050)
@ -314,14 +336,13 @@ if __name__ == '__main__':
max_pads, max_repeats = 0, 0
for i, b in tqdm(enumerate(dl)):
for ib in range(batch_sz):
#max_pads = max(max_pads, b['ctc_pads'].max())
#max_repeats = max(max_repeats, b['ctc_repeats'].max())
# max_pads = max(max_pads, b['ctc_pads'].max())
# max_repeats = max(max_repeats, b['ctc_repeats'].max())
print(f'{i} {ib} {b["real_text"][ib]}')
#save(b, i, ib, 'wav')
#save(b, i, ib, 'conditioning', 0)
#save(b, i, ib, 'conditioning', 1)
# save(b, i, ib, 'wav')
# save(b, i, ib, 'conditioning', 0)
# save(b, i, ib, 'conditioning', 1)
pass
if i > 15:
break
print(max_pads, max_repeats)

View File

@ -6,9 +6,8 @@ import torch.utils.data
from torch import LongTensor
from tqdm import tqdm
from models.audio.tts.tacotron2 import load_filepaths_and_text
from models.audio.tts.tacotron2 import symbols
from models.audio.tts.tacotron2 import text_to_sequence
from dlas.models.audio.tts.tacotron2 import (load_filepaths_and_text, symbols,
text_to_sequence)
class GptTtsDataset(torch.utils.data.Dataset):
@ -21,7 +20,7 @@ class GptTtsDataset(torch.utils.data.Dataset):
def __init__(self, opt):
self.path = os.path.dirname(opt['path'])
self.audiopaths_and_text = load_filepaths_and_text(opt['path'])
self.text_cleaners=['english_cleaners']
self.text_cleaners = ['english_cleaners']
self.MEL_DICTIONARY_SIZE = opt['mel_vocab_size']+3
self.MEL_START_TOKEN = LongTensor([self.MEL_DICTIONARY_SIZE-3])
@ -32,7 +31,8 @@ class GptTtsDataset(torch.utils.data.Dataset):
audiopath_and_text = self.audiopaths_and_text[index]
audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
text = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
text = torch.cat([self.TEXT_START_TOKEN, text, self.TEXT_STOP_TOKEN], dim=0)
text = torch.cat([self.TEXT_START_TOKEN, text,
self.TEXT_STOP_TOKEN], dim=0)
# Fetch quantized MELs
quant_path = audiopath.replace('wavs/', 'quantized_mels/') + '.pth'
@ -57,8 +57,9 @@ class GptTtsCollater():
def __call__(self, batch):
text_lens = [len(x[0]) for x in batch]
#max_text_len = max(text_lens)
max_text_len = self.MAX_SYMBOLS_PER_PHRASE # This forces all outputs to have the full 200 characters. Testing if this makes a difference.
# max_text_len = max(text_lens)
# This forces all outputs to have the full 200 characters. Testing if this makes a difference.
max_text_len = self.MAX_SYMBOLS_PER_PHRASE
mel_lens = [len(x[1]) for x in batch]
max_mel_len = max(mel_lens)
texts = []
@ -70,7 +71,8 @@ class GptTtsCollater():
text = F.pad(text, (0, max_text_len-len(text)), value=0)
text = torch.where(text == 0, text_range_embedding, text)
texts.append(text)
qmels.append(F.pad(qmel, (0, max_mel_len-len(qmel)), value=self.MEL_PAD_TOKEN))
qmels.append(F.pad(qmel, (0, max_mel_len-len(qmel)),
value=self.MEL_PAD_TOKEN))
filenames = [j[2] for j in batch]
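
# The collation idiom above in isolation (illustrative lengths and pad ids, not
# the repo's constants): every text sequence is padded to a fixed maximum while
# quantized-MEL sequences are padded to the longest one in the batch.
import torch
import torch.nn.functional as F

MAX_TEXT_LEN = 200        # stands in for MAX_SYMBOLS_PER_PHRASE
MEL_PAD_TOKEN = 515       # illustrative pad id

texts = [torch.randint(1, 100, (n,)) for n in (12, 47, 200)]
qmels = [torch.randint(0, 512, (n,)) for n in (310, 420, 365)]

padded_text = torch.stack([F.pad(t, (0, MAX_TEXT_LEN - len(t)), value=0) for t in texts])
max_mel_len = max(len(q) for q in qmels)
padded_qmel = torch.stack([F.pad(q, (0, max_mel_len - len(q)), value=MEL_PAD_TOKEN) for q in qmels])
assert padded_text.shape == (3, 200) and padded_qmel.shape == (3, 420)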
@ -96,7 +98,7 @@ if __name__ == '__main__':
'batch_size': 16,
'mel_vocab_size': 512,
}
from data import create_dataset, create_dataloader
from data import create_dataloader, create_dataset
ds, c = create_dataset(params, return_collate=True)
dl = create_dataloader(ds, params, collate_fn=c)
@ -107,5 +109,5 @@ if __name__ == '__main__':
for b in tqdm(dl):
max_mel = max(max_mel, b['padded_qmel'].shape[2])
max_text = max(max_text, b['padded_text'].shape[1])
m=torch.stack(m)
m = torch.stack(m)
print(m.mean(), m.std())

View File

@ -7,14 +7,15 @@ import torchaudio
from munch import munchify
from tqdm import tqdm
from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset
from data.text.hf_datasets_wrapper import HfDataset
from utils.util import opt_get
from dlas.data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset
from dlas.data.text.hf_datasets_wrapper import HfDataset
from dlas.utils.util import opt_get
def build_paired_voice_dataset(args):
from data.audio.paired_voice_audio_dataset import TextWavLoader as D
from models.audio.tts.tacotron2 import create_hparams
from dlas.data.audio.paired_voice_audio_dataset import TextWavLoader as D
default_params = create_hparams()
default_params.update(args)
dataset_opt = munchify(default_params)
@ -33,6 +34,7 @@ class GrandConjoinedDataset(torch.utils.data.Dataset):
Performs tokenization at this level, ignoring any tokenization performed by upstream datasets.
"""
def __init__(self, opt):
sample_rate = 22050 # Fixed.
paired_dataset_args = opt['paired_dataset_args']
@ -47,7 +49,8 @@ class GrandConjoinedDataset(torch.utils.data.Dataset):
self.max_solo_text_length = opt['max_solo_text_length']
self.collate = opt_get(opt, ['needs_collate'], False)
self.sample_rate = sample_rate
self.num_conditioning_candidates = opt_get(opt, ['num_conditioning_candidates'], 0)
self.num_conditioning_candidates = opt_get(
opt, ['num_conditioning_candidates'], 0)
self.conditioning_length = opt_get(opt, ['conditioning_length'], 44000)
load_conditioning = self.num_conditioning_candidates > 0
@ -75,7 +78,8 @@ class GrandConjoinedDataset(torch.utils.data.Dataset):
def fetch_text_at(self, i):
try:
txt = self.text[i % len(self.text)]['text']
assert '*' not in txt # This is a hack to get around the use of '*' to mask expletives in some text-only datasets. There really isn't a linguistic use for this character anyways.
# This is a hack to get around the use of '*' to mask expletives in some text-only datasets. There really isn't a linguistic use for this character anyways.
assert '*' not in txt
tok = self.speech_and_text.get_text(txt)
padding_required = self.max_solo_text_length - tok.shape[0]
if padding_required < 0:
@ -137,7 +141,8 @@ class GrandConjoinedDataset(torch.utils.data.Dataset):
sp = self.speech[i % len(self.speech)]
# Set upper bound on solo speech lengths. This is handled automatically when collation is turned off, but needs to be done otherwise.
sp['clip'] = sp['clip'][:, :self.max_solo_audio_length]
sp['clip_lengths'] = sp['clip_lengths'].clamp(0, self.max_solo_audio_length)
sp['clip_lengths'] = sp['clip_lengths'].clamp(
0, self.max_solo_audio_length)
return self.optionally_add_conditioning_candidates({
'paired_audio': snt['wav'],
'paired_audio_lengths': snt['wav_lengths'],
@ -205,7 +210,7 @@ if __name__ == '__main__':
'use_bpe_tokenizer': False,
},
}
from data import create_dataset, create_dataloader
from data import create_dataloader, create_dataset
os.remove('test_cache_delete_me2.pth')
ds, c = create_dataset(train_params, return_collate=True)
@ -213,7 +218,8 @@ if __name__ == '__main__':
def save(b, i, ib, key, c=None):
if c is not None:
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav', b[key][ib][c], 22050)
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav',
b[key][ib][c], 22050)
else:
torchaudio.save(f'{i}_clip_{ib}_{key}.wav', b[key][ib], 22050)
@ -224,16 +230,17 @@ if __name__ == '__main__':
m = None
for i, b in tqdm(enumerate(dl)):
for ib in range(batch_sz):
#save(b, i, ib, 'paired_audio')
#save(b, i, ib, 'paired_audio_conditioning', 0)
#save(b, i, ib, 'paired_audio_conditioning', 1)
print(f'Paired file: {b["paired_file"][ib]} text: {b["paired_text"][ib]}')
print(f'Paired text decoded: {decode(b, ib, "paired_text_tokens")}')
#save(b, i, ib, 'speech_audio')
#save(b, i, ib, 'speech_audio_conditioning', 0)
#save(b, i, ib, 'speech_audio_conditioning', 1)
#print(f'Text: {b["text_text"][ib]}')
#print(f'Text decoded: {decode(b, ib, "text_tokens")}')
# save(b, i, ib, 'paired_audio')
# save(b, i, ib, 'paired_audio_conditioning', 0)
# save(b, i, ib, 'paired_audio_conditioning', 1)
print(
f'Paired file: {b["paired_file"][ib]} text: {b["paired_text"][ib]}')
print(
f'Paired text decoded: {decode(b, ib, "paired_text_tokens")}')
# save(b, i, ib, 'speech_audio')
# save(b, i, ib, 'speech_audio_conditioning', 0)
# save(b, i, ib, 'speech_audio_conditioning', 1)
# print(f'Text: {b["text_text"][ib]}')
# print(f'Text decoded: {decode(b, ib, "text_tokens")}')
if i > 5:
break

View File

@ -7,32 +7,36 @@ import torch.utils.data
import torchaudio
from tqdm import tqdm
from data.audio.unsupervised_audio_dataset import load_audio
from data.util import find_files_of_type, is_audio_file
from models.audio.tts.tacotron2 import load_filepaths_and_text
from models.audio.tts.tacotron2 import text_to_sequence
from utils.util import opt_get
from dlas.data.audio.unsupervised_audio_dataset import load_audio
from dlas.data.util import find_files_of_type, is_audio_file
from dlas.models.audio.tts.tacotron2 import (load_filepaths_and_text,
text_to_sequence)
from dlas.utils.util import opt_get
def load_tsv(filename):
with open(filename, encoding='utf-8') as f:
components = [line.strip().split('\t') for line in f]
base = os.path.dirname(filename)
filepaths_and_text = [[os.path.join(base, f'{component[1]}'), component[0]] for component in components]
filepaths_and_text = [
[os.path.join(base, f'{component[1]}'), component[0]] for component in components]
return filepaths_and_text
def load_mozilla_cv(filename):
with open(filename, encoding='utf-8') as f:
components = [line.strip().split('\t') for line in f][1:] # First line is the header
components = [line.strip().split('\t')
for line in f][1:] # First line is the header
base = os.path.dirname(filename)
filepaths_and_text = [[os.path.join(base, f'clips/{component[1]}'), component[2]] for component in components]
filepaths_and_text = [[os.path.join(
base, f'clips/{component[1]}'), component[2]] for component in components]
return filepaths_and_text
def load_voxpopuli(filename):
with open(filename, encoding='utf-8') as f:
lines = [line.strip().split('\t') for line in f][1:] # First line is the header
lines = [line.strip().split('\t')
for line in f][1:] # First line is the header
base = os.path.dirname(filename)
filepaths_and_text = []
for line in lines:
@ -40,7 +44,8 @@ def load_voxpopuli(filename):
continue
file, raw_text, norm_text, speaker_id, split, gender = line
year = file[:4]
filepaths_and_text.append([os.path.join(base, year, f'{file}.ogg.wav'), raw_text])
filepaths_and_text.append(
[os.path.join(base, year, f'{file}.ogg.wav'), raw_text])
return filepaths_and_text
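
# Worked example of the load_tsv() convention above: each line is
# "<transcript>\t<wav path relative to the tsv>", and the loader returns
# [absolute_path, transcript] pairs. File contents and paths here are made up.
import os
import tempfile

tmpdir = tempfile.mkdtemp()
tsv_path = os.path.join(tmpdir, 'transcribed.tsv')
with open(tsv_path, 'w', encoding='utf-8') as f:
    f.write("Hello there.\twavs/0001.wav\n")
    f.write("A second line of text.\twavs/0002.wav\n")

with open(tsv_path, encoding='utf-8') as f:
    components = [line.strip().split('\t') for line in f]
base = os.path.dirname(tsv_path)
filepaths_and_text = [[os.path.join(base, component[1]), component[0]] for component in components]
# -> [[<tmpdir>/wavs/0001.wav, 'Hello there.'], [<tmpdir>/wavs/0002.wav, 'A second line of text.']]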
@ -56,8 +61,10 @@ class TextWavLoader(torch.utils.data.Dataset):
assert len(self.path) == len(fetcher_mode)
self.load_conditioning = opt_get(hparams, ['load_conditioning'], False)
self.conditioning_candidates = opt_get(hparams, ['num_conditioning_candidates'], 3)
self.conditioning_length = opt_get(hparams, ['conditioning_length'], 44100)
self.conditioning_candidates = opt_get(
hparams, ['num_conditioning_candidates'], 3)
self.conditioning_length = opt_get(
hparams, ['conditioning_length'], 44100)
self.audiopaths_and_text = []
for p, fm in zip(self.path, fetcher_mode):
if fm == 'lj' or fm == 'libritts':
@ -65,10 +72,12 @@ class TextWavLoader(torch.utils.data.Dataset):
elif fm == 'tsv':
fetcher_fn = load_tsv
elif fm == 'mozilla_cv':
assert not self.load_conditioning # Conditioning inputs are incompatible with mozilla_cv
# Conditioning inputs are incompatible with mozilla_cv
assert not self.load_conditioning
fetcher_fn = load_mozilla_cv
elif fm == 'voxpopuli':
assert not self.load_conditioning # Conditioning inputs are incompatible with voxpopuli