PIP-ified (credit to https://git.ecker.tech/eschmidbauer)

parent fe24641763
commit a4afad8837
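This commit repackages the repository as an installable `dlas` Python package: the old `codes/` tree becomes `dlas/`, intra-repo absolute imports gain a `dlas.` prefix, and the touched files pick up autopep8/isort-style reformatting. A minimal illustration of the import change follows; the pip command is an assumption about a conventional setuptools layout, not something shown in the diff.

# Before this commit, modules imported each other via top-level packages:
#   from utils.util import opt_get
# After it, the same helper is addressed through the dlas package:
from dlas.utils.util import opt_get

# Assuming a standard setup.py/pyproject.toml is present, a checkout would
# typically be installed in editable mode so these imports resolve:
#   pip install -e .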
.gitignore (vendored, 72 changed lines)

@@ -1,30 +1,3 @@
dlas/experiments/*
dlas/codes/*.txt
dlas/codes/wandb/*
dlas/codes/pretrained_models/*
dlas/codes/scripts/audio/pretrained_models/*

results/*
tb_logger/*
datasets/*
options/*
data/*
.vscode

*.html
*.png
*.jpg
*.gif
*.pth
*.pytorch
*.zip
*.cu
*.pt
*.pth
*.pdf
*.tsv

# template

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -36,6 +9,7 @@ __pycache__/

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
@@ -47,12 +21,9 @@ lib64/
parts/
sdist/
var/
wheels/
pretrained/*
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
@@ -72,9 +43,8 @@ htmlcov/
.cache
nosetests.xml
coverage.xml
*.cover
*,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
@@ -83,14 +53,6 @@ coverage.xml
# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
@@ -98,36 +60,8 @@ docs/_build/
# PyBuilder
target/

# Jupyter Notebook
#Ipython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
@@ -1 +1 @@
recursive-include codes/*
recursive-include dlas/*
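The single-line hunk above appears to come from MANIFEST.in: `recursive-include` is a setuptools source-distribution directive, so once the `codes/` tree is renamed to `dlas/`, the include pattern has to follow it for the package files to keep being bundled into sdists.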
dlas/data/__init__.py

@@ -3,7 +3,7 @@ import torch
import torch.utils.data
from munch import munchify

from utils.util import opt_get
from dlas.utils.util import opt_get


def create_dataloader(dataset, dataset_opt, opt=None, sampler=None, collate_fn=None, shuffle=True):
@@ -33,77 +33,90 @@ def create_dataset(dataset_opt, return_collate=False):

# datasets for image restoration
if mode == 'fullimage':
from data.images.full_image_dataset import FullImageDataset as D
from dlas.data.images.full_image_dataset import FullImageDataset as D
elif mode == 'single_image_extensible':
from data.images.single_image_dataset import SingleImageDataset as D
from dlas.data.images.single_image_dataset import \
SingleImageDataset as D
elif mode == 'multi_frame_extensible':
from data.images.multi_frame_dataset import MultiFrameDataset as D
from dlas.data.images.multi_frame_dataset import MultiFrameDataset as D
elif mode == 'combined':
from data.combined_dataset import CombinedDataset as D
from dlas.data.combined_dataset import CombinedDataset as D
elif mode == 'multiscale':
from data.images.multiscale_dataset import MultiScaleDataset as D
from dlas.data.images.multiscale_dataset import MultiScaleDataset as D
elif mode == 'paired_frame':
from data.images.paired_frame_dataset import PairedFrameDataset as D
from dlas.data.images.paired_frame_dataset import \
PairedFrameDataset as D
elif mode == 'stylegan2':
from data.images.stylegan2_dataset import Stylegan2Dataset as D
from dlas.data.images.stylegan2_dataset import Stylegan2Dataset as D
elif mode == 'imagefolder':
from data.images.image_folder_dataset import ImageFolderDataset as D
from dlas.data.images.image_folder_dataset import \
ImageFolderDataset as D
elif mode == 'torch_dataset':
from data.torch_dataset import TorchDataset as D
elif mode == 'byol_dataset':
from data.images.byol_attachment import ByolDatasetWrapper as D
from dlas.data.images.byol_attachment import ByolDatasetWrapper as D
elif mode == 'byol_structured_dataset':
from data.images.byol_attachment import StructuredCropDatasetWrapper as D
from dlas.data.images.byol_attachment import \
StructuredCropDatasetWrapper as D
elif mode == 'random_aug_wrapper':
from data.images.byol_attachment import DatasetRandomAugWrapper as D
from dlas.data.images.byol_attachment import \
DatasetRandomAugWrapper as D
elif mode == 'random_dataset':
from data.images.random_dataset import RandomDataset as D
from dlas.data.images.random_dataset import RandomDataset as D
elif mode == 'zipfile':
from data.images.zip_file_dataset import ZipFileDataset as D
from dlas.data.images.zip_file_dataset import ZipFileDataset as D
elif mode == 'nv_tacotron':
from data.audio.nv_tacotron_dataset import TextWavLoader as D
from data.audio.nv_tacotron_dataset import TextMelCollate as C
from models.audio.tts.tacotron2 import create_hparams
from dlas.data.audio.nv_tacotron_dataset import TextMelCollate as C
from dlas.data.audio.nv_tacotron_dataset import TextWavLoader as D
from dlas.models.audio.tts.tacotron2 import create_hparams
default_params = create_hparams()
default_params.update(dataset_opt)
dataset_opt = munchify(default_params)
if opt_get(dataset_opt, ['needs_collate'], True):
collate = C()
elif mode == 'paired_voice_audio':
from data.audio.paired_voice_audio_dataset import TextWavLoader as D
from models.audio.tts.tacotron2 import create_hparams
from dlas.data.audio.paired_voice_audio_dataset import \
TextWavLoader as D
from dlas.models.audio.tts.tacotron2 import create_hparams
default_params = create_hparams()
default_params.update(dataset_opt)
dataset_opt = munchify(default_params)
elif mode == 'fast_paired_voice_audio':
from data.audio.fast_paired_dataset import FastPairedVoiceDataset as D
from models.audio.tts.tacotron2 import create_hparams
from dlas.data.audio.fast_paired_dataset import \
FastPairedVoiceDataset as D
from dlas.models.audio.tts.tacotron2 import create_hparams
default_params = create_hparams()
default_params.update(dataset_opt)
dataset_opt = munchify(default_params)
elif mode == 'fast_paired_voice_audio_with_phonemes':
from data.audio.fast_paired_dataset_with_phonemes import FastPairedVoiceDataset as D
from models.audio.tts.tacotron2 import create_hparams
from dlas.data.audio.fast_paired_dataset_with_phonemes import \
FastPairedVoiceDataset as D
from dlas.models.audio.tts.tacotron2 import create_hparams
default_params = create_hparams()
default_params.update(dataset_opt)
dataset_opt = munchify(default_params)
elif mode == 'gpt_tts':
from data.audio.gpt_tts_dataset import GptTtsDataset as D
from data.audio.gpt_tts_dataset import GptTtsCollater as C
from dlas.data.audio.gpt_tts_dataset import GptTtsCollater as C
from dlas.data.audio.gpt_tts_dataset import GptTtsDataset as D
collate = C(dataset_opt)
elif mode == 'unsupervised_audio':
from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset as D
from dlas.data.audio.unsupervised_audio_dataset import \
UnsupervisedAudioDataset as D
elif mode == 'unsupervised_audio_with_noise':
from data.audio.audio_with_noise_dataset import AudioWithNoiseDataset as D
from dlas.data.audio.audio_with_noise_dataset import \
AudioWithNoiseDataset as D
elif mode == 'preprocessed_mel':
from data.audio.preprocessed_mel_dataset import PreprocessedMelDataset as D
from dlas.data.audio.preprocessed_mel_dataset import \
PreprocessedMelDataset as D
elif mode == 'grand_conjoined_voice':
from data.audio.grand_conjoined_dataset import GrandConjoinedDataset as D
from data.zero_pad_dict_collate import ZeroPadDictCollate as C
from dlas.data.audio.grand_conjoined_dataset import \
GrandConjoinedDataset as D
from dlas.data.zero_pad_dict_collate import ZeroPadDictCollate as C
if opt_get(dataset_opt, ['needs_collate'], False):
collate = C()
else:
raise NotImplementedError('Dataset [{:s}] is not recognized.'.format(mode))
raise NotImplementedError(
'Dataset [{:s}] is not recognized.'.format(mode))
dataset = D(dataset_opt)

if return_collate:
@@ -115,9 +128,10 @@ def create_dataset(dataset_opt, return_collate=False):
def get_dataset_debugger(dataset_opt):
mode = dataset_opt['mode']
if mode == 'paired_voice_audio':
from data.audio.paired_voice_audio_dataset import PairedVoiceDebugger
from dlas.data.audio.paired_voice_audio_dataset import \
PairedVoiceDebugger
return PairedVoiceDebugger()
elif mode == 'fast_paired_voice_audio':
from data.audio.fast_paired_dataset import FastPairedVoiceDebugger
from dlas.data.audio.fast_paired_dataset import FastPairedVoiceDebugger
return FastPairedVoiceDebugger()
return None
return None
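Taken together, these factory functions keep the same driver pattern the repo's own `__main__` test blocks use. A minimal sketch of that usage, assuming the package is importable as `dlas` after installation (the in-repo tests still import the bare `data` package):

# Sketch only; the exact option keys depend on the chosen mode.
from dlas.data import create_dataset, create_dataloader  # assumed install-time import path

dataset_opt = {
    'mode': 'unsupervised_audio',   # any of the modes dispatched above
    'phase': 'train',
    'n_workers': 0,
    'batch_size': 16,
    # ...plus whatever keys the selected Dataset class reads (paths, lengths, etc.)
}

ds = create_dataset(dataset_opt)            # picks and instantiates the Dataset for 'mode'
dl = create_dataloader(ds, dataset_opt)     # wraps it in a torch DataLoader
batch = next(iter(dl))                      # pull one batch to exercise the pipeline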
dlas/data/audio/__init__.py (new empty file, 0 lines)
dlas/data/audio/audio_with_noise_dataset.py

@@ -2,18 +2,17 @@ import random
import sys
from math import pi

import librosa
import torch
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset
from tqdm import tqdm

import torch.nn.functional as F
from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset, load_audio
from data.util import load_paths_from_cache, find_files_of_type, is_audio_file

# Just all ones.
from utils.util import opt_get
from dlas.data.audio.unsupervised_audio_dataset import (
UnsupervisedAudioDataset, load_audio)
from dlas.data.util import (find_files_of_type, is_audio_file,
load_paths_from_cache)
from dlas.utils.util import opt_get


def _integration_fn_fully_enabled(n):
@@ -23,7 +22,7 @@ def _integration_fn_fully_enabled(n):
# Randomly assigns up to 5 blocks of the output tensor the value '1'. Rest is zero
def _integration_fn_spiky(n):
fn = torch.zeros((n,))
spikes = random.randint(1,5)
spikes = random.randint(1, 5)
for _ in range(spikes):
sz = random.randint(n//8, n//2)
pos = random.randint(0, n)
@@ -35,18 +34,19 @@ def _integration_fn_spiky(n):
# Uses a sinusoidal ramp up and down (of random length) to a peak which is held for a random duration.
def _integration_fn_smooth(n):
center = random.randint(1, n-2)
max_duration=n-center-1
max_duration = n-center-1
duration = random.randint(max_duration//4, max_duration)
end = center+duration

ramp_up_sz = random.randint(n//16,n//4)
ramp_up = torch.sin(pi*torch.arange(0,ramp_up_sz)/(2*ramp_up_sz))
ramp_up_sz = random.randint(n//16, n//4)
ramp_up = torch.sin(pi*torch.arange(0, ramp_up_sz)/(2*ramp_up_sz))
if ramp_up_sz > center:
ramp_up = ramp_up[(ramp_up_sz-center):]
ramp_up_sz = center

ramp_down_sz = random.randint(n//16,n//4)
ramp_down = torch.flip(torch.sin(pi*torch.arange(0,ramp_down_sz)/(2*ramp_down_sz)), dims=[0])
ramp_down_sz = random.randint(n//16, n//4)
ramp_down = torch.flip(
torch.sin(pi*torch.arange(0, ramp_down_sz)/(2*ramp_down_sz)), dims=[0])
if ramp_down_sz > (n-end):
ramp_down = ramp_down[:(n-end)]
ramp_down_sz = n-end
@@ -71,16 +71,22 @@ def load_rir(path, sr, max_sz):
Wraps a unsupervised_audio_dataset and applies noise to the output clips, then provides labels depending on what
noise was added.
'''


class AudioWithNoiseDataset(Dataset):
def __init__(self, opt):
self.underlying_dataset = UnsupervisedAudioDataset(opt)
self.env_noise_paths = load_paths_from_cache(opt['env_noise_paths'], opt['env_noise_cache'])
self.music_paths = load_paths_from_cache(opt['music_paths'], opt['music_cache'])
self.openair_paths = find_files_of_type('img', opt['openair_path'], qualifier=is_audio_file)[0]
self.env_noise_paths = load_paths_from_cache(
opt['env_noise_paths'], opt['env_noise_cache'])
self.music_paths = load_paths_from_cache(
opt['music_paths'], opt['music_cache'])
self.openair_paths = find_files_of_type(
'img', opt['openair_path'], qualifier=is_audio_file)[0]
self.min_volume = opt_get(opt, ['min_noise_volume'], .2)
self.max_volume = opt_get(opt, ['max_noise_volume'], .5)
self.sampling_rate = self.underlying_dataset.sampling_rate
self.use_gpu_for_reverb_compute = opt_get(opt, ['use_gpu_for_reverb_compute'], True)
self.use_gpu_for_reverb_compute = opt_get(
opt, ['use_gpu_for_reverb_compute'], True)
self.openair_kernels = None
self.current_item_fetch = 0
self.fetch_error_count = 0
@@ -90,7 +96,8 @@ class AudioWithNoiseDataset(Dataset):
# Load the openair reverbs as CUDA tensors.
self.openair_kernels = []
for oa in self.openair_paths:
self.openair_kernels.append(load_rir(oa, self.underlying_dataset.sampling_rate, self.underlying_dataset.sampling_rate*2).cuda())
self.openair_kernels.append(load_rir(
oa, self.underlying_dataset.sampling_rate, self.underlying_dataset.sampling_rate*2).cuda())

def __getitem__(self, item):
if self.current_item_fetch != item:
@@ -113,10 +120,11 @@ class AudioWithNoiseDataset(Dataset):
clip = clip * clipvol

label = random.randint(0, 4)  # Current excludes GSM corruption.
#label = 3
# label = 3
if label > 0 and label < 4:  # 0 is basically "leave it alone"
aug_needed = True
augvol = (random.random() * (self.max_volume-self.min_volume) + self.min_volume)
augvol = (random.random() * (self.max_volume -
self.min_volume) + self.min_volume)
if label == 1:
# Add environmental noise.
augpath = random.choice(self.env_noise_paths)
@@ -131,13 +139,15 @@ class AudioWithNoiseDataset(Dataset):
# This can take two forms:
if padding_room < 22000 or random.random() < .5:
# (1) The voices talk over one another. If there is no padding room, we always take this choice.
intg_fns = [_integration_fn_smooth, _integration_fn_fully_enabled]
intg_fns = [_integration_fn_smooth,
_integration_fn_fully_enabled]
else:
# (2) There are simply two voices in the clip, separated from one another.
# This is a special case that does not use the same logic as the rest of the augmentations.
aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
aug = load_audio(
augpath, self.underlying_dataset.sampling_rate)
# Pad with some random silence
aug = F.pad(aug, (random.randint(20,4000), 0))
aug = F.pad(aug, (random.randint(20, 4000), 0))
# Fit what we can given the padding room we have.
aug = aug[:, :padding_room]
clip = torch.cat([clip, aug], dim=1)
@@ -146,7 +156,8 @@ class AudioWithNoiseDataset(Dataset):
out['clip_lengths'] = torch.tensor(clip.shape[-1])
aug_needed = False
if aug_needed:
aug = load_audio(augpath, self.underlying_dataset.sampling_rate)
aug = load_audio(
augpath, self.underlying_dataset.sampling_rate)
if aug.shape[1] > clip.shape[1]:
n, cn = aug.shape[1], clip.shape[1]
gap = n-cn
@@ -157,7 +168,8 @@ class AudioWithNoiseDataset(Dataset):
if aug.shape[1] < clip.shape[1]:
gap = clip.shape[1] - aug.shape[1]
placement = random.randint(0, gap-1)
aug = torch.nn.functional.pad(aug, (placement, gap-placement))
aug = torch.nn.functional.pad(
aug, (placement, gap-placement))
clip = clip + aug
elif label == 4:
# Perform reverb (to simulate being in a large room with an omni-mic). This is performed by convolving
@@ -166,19 +178,23 @@ class AudioWithNoiseDataset(Dataset):
rir = random.choice(self.openair_kernels)
else:
augpath = random.choice(self.openair_paths)
rir = load_rir(augpath, self.underlying_dataset.sampling_rate, clip.shape[-1])
rir = load_rir(
augpath, self.underlying_dataset.sampling_rate, clip.shape[-1])
clip = torch.nn.functional.pad(clip, (rir.shape[1]-1, 0))
if self.use_gpu_for_reverb_compute:
clip = clip.cuda()
clip = torch.nn.functional.conv1d(clip.unsqueeze(0), rir.unsqueeze(0)).squeeze(0).cpu()
clip = torch.nn.functional.conv1d(
clip.unsqueeze(0), rir.unsqueeze(0)).squeeze(0).cpu()
elif label == 5:
# Apply the GSM codec to simulate cellular phone audio.
clip = torchaudio.functional.apply_codec(clip, self.underlying_dataset.sampling_rate, format="gsm")
clip = torchaudio.functional.apply_codec(
clip, self.underlying_dataset.sampling_rate, format="gsm")
except:
if self.fetch_error_count > 10:
print(f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
print(
f"Exception encountered processing {item}, re-trying because this is often just a failed aug.")
print(sys.exc_info())
#raise  # Uncomment to surface exceptions.
# raise  # Uncomment to surface exceptions.
self.fetch_error_count += 1
return self[item]

@@ -187,7 +203,7 @@ class AudioWithNoiseDataset(Dataset):
clip = F.pad(clip, (0, padding_room))
out['clip'] = clip
out['label'] = label
#out['aug'] = aug
# out['aug'] = aug
out['augpath'] = augpath
out['augvol'] = augvol
out['clipvol'] = clipvol
@@ -216,14 +232,15 @@ if __name__ == '__main__':
'openair_path': 'D:\\data\\audio\\openair\\resampled',
'use_gpu_for_reverb_compute': False,
}
from data import create_dataset, create_dataloader, util
from data import create_dataloader, create_dataset, util

ds = create_dataset(params)
dl = create_dataloader(ds, params, pin_memory=False)
i = 0
for b in tqdm(dl):
for b_ in range(b['clip'].shape[0]):
#torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_][:, :b['clip_lengths'][b_]], ds.sampling_rate)
#torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
print(f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
# torchaudio.save(f'{i}_clip_{b_}_{b["label"][b_].item()}.wav', b['clip'][b_][:, :b['clip_lengths'][b_]], ds.sampling_rate)
# torchaudio.save(f'{i}_clip_{b_}_aug.wav', b['aug'][b_], ds.sampling_rate)
print(
f'{i} aug path: {b["augpath"][b_]} aug volume: {b["augvol"][b_]} clip volume: {b["clipvol"][b_]}')
i += 1
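`opt_get` is used throughout these hunks to read optional settings with a fallback default. Judging from its call sites alone, it behaves roughly like the helper sketched below; this is a paraphrase for readers following along, not the actual `dlas.utils.util` implementation.

def opt_get(opt, keys, default=None):
    # Walk `opt` along the key path in `keys`; fall back to `default` if any
    # level is missing. Munch/munchify objects subclass dict, so plain dict
    # access covers the hparams objects used above.
    if opt is None:
        return default
    cur = opt
    for k in keys:
        if isinstance(cur, dict) and k in cur:
            cur = cur[k]
        else:
            return default
    return cur

# e.g. opt_get(hparams, ['max_wav_length'], None) -> hparams['max_wav_length'] if set, else None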
dlas/data/audio/fast_paired_dataset.py

@@ -12,13 +12,15 @@ import torchaudio
from tqdm import tqdm
from transformers import Wav2Vec2CTCTokenizer

from data.audio.paired_voice_audio_dataset import CharacterTokenizer
from data.audio.unsupervised_audio_dataset import load_audio, load_similar_clips
from utils.util import opt_get
from dlas.data.audio.paired_voice_audio_dataset import CharacterTokenizer
from dlas.data.audio.unsupervised_audio_dataset import (load_audio,
load_similar_clips)
from dlas.utils.util import opt_get


def parse_tsv_aligned_codes(line, base_path):
fpt = line.strip().split('\t')

def convert_string_list_to_tensor(strlist):
if strlist.startswith('['):
strlist = strlist[1:]
@@ -43,6 +45,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):

The upshot is that this dataset loads extremely quickly and consumes almost no system memory.
"""

def __init__(self, hparams):
self.paths = hparams['path']
if not isinstance(self.paths, list):
@@ -52,26 +55,33 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
self.types = opt_get(hparams, ['types'], [0 for _ in self.paths])

self.load_conditioning = opt_get(hparams, ['load_conditioning'], False)
self.conditioning_candidates = opt_get(hparams, ['num_conditioning_candidates'], 1)
self.conditioning_length = opt_get(hparams, ['conditioning_length'], 44100)
self.produce_ctc_metadata = opt_get(hparams, ['produce_ctc_metadata'], False)
self.debug_failures = opt_get(hparams, ['debug_loading_failures'], False)
self.conditioning_candidates = opt_get(
hparams, ['num_conditioning_candidates'], 1)
self.conditioning_length = opt_get(
hparams, ['conditioning_length'], 44100)
self.produce_ctc_metadata = opt_get(
hparams, ['produce_ctc_metadata'], False)
self.debug_failures = opt_get(
hparams, ['debug_loading_failures'], False)
self.text_cleaners = hparams.text_cleaners
self.sample_rate = hparams.sample_rate
self.aligned_codes_to_audio_ratio = 443 * self.sample_rate // 22050
self.max_wav_len = opt_get(hparams, ['max_wav_length'], None)
self.load_aligned_codes = opt_get(hparams, ['load_aligned_codes'], False)
self.load_aligned_codes = opt_get(
hparams, ['load_aligned_codes'], False)
if self.max_wav_len is not None:
self.max_aligned_codes = self.max_wav_len // self.aligned_codes_to_audio_ratio
self.max_text_len = opt_get(hparams, ['max_text_length'], None)
assert self.max_wav_len is not None and self.max_text_len is not None
self.use_bpe_tokenizer = opt_get(hparams, ['use_bpe_tokenizer'], False)
if self.use_bpe_tokenizer:
from data.audio.voice_tokenizer import VoiceBpeTokenizer
self.tokenizer = VoiceBpeTokenizer(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
from dlas.data.audio.voice_tokenizer import VoiceBpeTokenizer
self.tokenizer = VoiceBpeTokenizer(opt_get(
hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
else:
self.tokenizer = CharacterTokenizer()
self.skipped_items = 0  # records how many items are skipped when accessing an index.
# records how many items are skipped when accessing an index.
self.skipped_items = 0

self.load_times = torch.zeros((256,))
self.load_ind = 0
@@ -110,7 +120,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
try:  # This can fail when seeking to a UTF-8 escape byte.
f.readline()
except:
return self.load_random_line(depth=depth + 1), type  # On failure, just recurse and try again.
# On failure, just recurse and try again.
return self.load_random_line(depth=depth + 1), type
l2 = f.readline()

if l2:
@@ -119,14 +130,16 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
return parse_tsv_aligned_codes(l2, base_path), type
except:
print(f"error parsing random offset: {sys.exc_info()}")
return self.load_random_line(depth=depth+1), type  # On failure, just recurse and try again.
# On failure, just recurse and try again.
return self.load_random_line(depth=depth+1), type

def get_ctc_metadata(self, codes):
grouped = groupby(codes.tolist())
rcodes, repeats, seps = [], [], [0]
for val, group in grouped:
if val == 0:
seps[-1] = len(list(group))  # This is a very important distinction! It means the padding belongs to the character proceeding it.
# This is a very important distinction! It means the padding belongs to the character proceeding it.
seps[-1] = len(list(group))
else:
rcodes.append(val)
repeats.append(len(list(group)))
@@ -142,7 +155,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if rcodes.shape[0] < self.max_text_len:
gap = self.max_text_len - rcodes.shape[0]
rcodes = F.pad(rcodes, (0, gap))
repeats = F.pad(repeats, (0, gap), value=1)  # The minimum value for repeats is 1, hence this is the pad value too.
# The minimum value for repeats is 1, hence this is the pad value too.
repeats = F.pad(repeats, (0, gap), value=1)
seps = F.pad(seps, (0, gap))
elif rcodes.shape[0] > self.max_text_len:
rcodes = rcodes[:self.max_text_len]
@@ -165,7 +179,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if text is None or len(text.strip()) == 0:
raise ValueError
cond, cond_is_self = load_similar_clips(apt[0], self.conditioning_length, self.sample_rate,
n=self.conditioning_candidates) if self.load_conditioning else (None, False)
n=self.conditioning_candidates) if self.load_conditioning else (None, False)
except:
if self.skipped_items > 100:
raise  # Rethrow if we have nested too far.
@@ -179,12 +193,13 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
self.skipped_items = 0
if wav is None or \
(self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len) or \
(self.max_text_len is not None and tseq.shape[0] > self.max_text_len):
(self.max_text_len is not None and tseq.shape[0] > self.max_text_len):
# Basically, this audio file is nonexistent or too long to be supported by the dataset.
# It's hard to handle this situation properly. Best bet is to return the a random valid token and skew the dataset somewhat as a result.
if self.debug_failures:
print(f"error loading {path}: ranges are out of bounds; {wav.shape[-1]}, {tseq.shape[0]}")
rv = random.randint(0,len(self)-1)
print(
f"error loading {path}: ranges are out of bounds; {wav.shape[-1]}, {tseq.shape[0]}")
rv = random.randint(0, len(self)-1)
return self[rv]
orig_output = wav.shape[-1]
orig_text_len = tseq.shape[0]
@@ -192,7 +207,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if wav.shape[-1] != self.max_wav_len:
wav = F.pad(wav, (0, self.max_wav_len - wav.shape[-1]))
# These codes are aligned to audio inputs, so make sure to pad them as well.
aligned_codes = F.pad(aligned_codes, (0, self.max_aligned_codes-aligned_codes.shape[0]))
aligned_codes = F.pad(
aligned_codes, (0, self.max_aligned_codes-aligned_codes.shape[0]))
if tseq.shape[0] != self.max_text_len:
tseq = F.pad(tseq, (0, self.max_text_len - tseq.shape[0]))

@@ -223,7 +239,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
return res

def __len__(self):
return self.total_size_bytes // 1000  # 1000 cuts down a TSV file to the actual length pretty well.
# 1000 cuts down a TSV file to the actual length pretty well.
return self.total_size_bytes // 1000


class FastPairedVoiceDebugger:
@@ -243,7 +260,8 @@ class FastPairedVoiceDebugger:
if isinstance(state, dict):
self.total_items = opt_get(state, ['total_items'], 0)
self.loaded_items = opt_get(state, ['loaded_items'], 0)
self.self_conditioning_items = opt_get(state, ['self_conditioning_items'], 0)
self.self_conditioning_items = opt_get(
state, ['self_conditioning_items'], 0)

def update(self, batch):
self.total_items += batch['wav'].shape[0]
@@ -252,7 +270,8 @@ class FastPairedVoiceDebugger:
for filename in batch['filenames']:
self.unique_files.add(hashlib.sha256(filename.encode('utf-8')))
if 'conditioning' in batch.keys():
self.self_conditioning_items += batch['conditioning_contains_self'].sum().item()
self.self_conditioning_items += batch['conditioning_contains_self'].sum(
).item()

def get_debugging_map(self):
return {
@@ -269,13 +288,13 @@ if __name__ == '__main__':
params = {
'mode': 'fast_paired_voice_audio',
'path': ['y:/libritts/train-other-500/transcribed-oco.tsv',
'y:/libritts/train-clean-100/transcribed-oco.tsv',
'y:/libritts/train-clean-360/transcribed-oco.tsv',
'y:/clips/books1/transcribed-oco.tsv',
'y:/clips/books2/transcribed-oco.tsv',
'y:/bigasr_dataset/hifi_tts/transcribed-oco.tsv',
'y:/clips/podcasts-1/transcribed-oco.tsv',],
'types': [0,1,1,1,2,2,0],
'y:/libritts/train-clean-100/transcribed-oco.tsv',
'y:/libritts/train-clean-360/transcribed-oco.tsv',
'y:/clips/books1/transcribed-oco.tsv',
'y:/clips/books2/transcribed-oco.tsv',
'y:/bigasr_dataset/hifi_tts/transcribed-oco.tsv',
'y:/clips/podcasts-1/transcribed-oco.tsv',],
'types': [0, 1, 1, 1, 2, 2, 0],
'phase': 'train',
'n_workers': 0,
'batch_size': batch_sz,
@@ -289,11 +308,12 @@ if __name__ == '__main__':
'load_aligned_codes': True,
'produce_ctc_metadata': True,
}
from data import create_dataset, create_dataloader
from data import create_dataloader, create_dataset

def save(b, i, ib, key, c=None):
if c is not None:
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav', b[key][ib][c], 22050)
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav',
b[key][ib][c], 22050)
else:
torchaudio.save(f'{i}_clip_{ib}_{key}.wav', b[key][ib], 22050)

@@ -304,8 +324,8 @@ if __name__ == '__main__':
max_pads, max_repeats = 0, 0
for i, b in tqdm(enumerate(dl)):
for ib in range(batch_sz):
#max_pads = max(max_pads, b['ctc_pads'].max())
#max_repeats = max(max_repeats, b['ctc_repeats'].max())
# max_pads = max(max_pads, b['ctc_pads'].max())
# max_repeats = max(max_repeats, b['ctc_repeats'].max())
print(f'{i} {ib} {b["real_text"][ib]}')
save(b, i, ib, 'wav')
save(b, i, ib, 'conditioning', 0)
@@ -314,4 +334,3 @@ if __name__ == '__main__':
if i > 15:
break
print(max_pads, max_repeats)
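The FastPairedVoiceDataset changes above keep the file's core trick intact: rather than loading a huge transcript TSV into memory, item fetches seek to a random byte offset and parse the next complete line. A self-contained sketch of that pattern, with illustrative names rather than the repo's API:

import os
import random

def sample_random_tsv_line(path, depth=0):
    if depth > 10:
        raise RuntimeError(f'could not sample a line from {path}')
    size = os.path.getsize(path)
    with open(path, 'r', encoding='utf-8') as f:
        f.seek(random.randint(0, max(size - 2, 0)))  # CPython accepts arbitrary text-mode offsets
        try:
            f.readline()                 # discard the (probably partial) line we landed in
        except UnicodeDecodeError:       # landed inside a multi-byte UTF-8 sequence
            return sample_random_tsv_line(path, depth + 1)
        line = f.readline()
    if not line:                         # landed too close to EOF; retry
        return sample_random_tsv_line(path, depth + 1)
    return line.strip().split('\t')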
dlas/data/audio/fast_paired_dataset_with_phonemes.py

@@ -12,13 +12,15 @@ import torchaudio
from tqdm import tqdm
from transformers import Wav2Vec2Processor

from data.audio.paired_voice_audio_dataset import CharacterTokenizer
from data.audio.unsupervised_audio_dataset import load_audio, load_similar_clips
from utils.util import opt_get
from dlas.data.audio.paired_voice_audio_dataset import CharacterTokenizer
from dlas.data.audio.unsupervised_audio_dataset import (load_audio,
load_similar_clips)
from dlas.utils.util import opt_get


def parse_tsv_aligned_codes(line, base_path):
fpt = line.strip().split('\t')

def convert_string_list_to_tensor(strlist):
if strlist.startswith('['):
strlist = strlist[1:]
@@ -43,10 +45,12 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):

The upshot is that this dataset loads extremely quickly and consumes almost no system memory.
"""

def __init__(self, hparams):
self.paths = hparams['path']
phoneme_paths = hparams['phoneme_paths']
self.paths = [(p, False) for p in self.paths] + [(p, True) for p in phoneme_paths]
self.paths = [(p, False) for p in self.paths] + [(p, True)
for p in phoneme_paths]

self.paths_size_bytes = [os.path.getsize(p) for p, _ in self.paths]
self.total_size_bytes = sum(self.paths_size_bytes)
@@ -54,28 +58,36 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):

self.normal_text_end_token = hparams['normal_text_end_token']
self.load_conditioning = opt_get(hparams, ['load_conditioning'], False)
self.conditioning_candidates = opt_get(hparams, ['num_conditioning_candidates'], 1)
self.conditioning_length = opt_get(hparams, ['conditioning_length'], 44100)
self.produce_ctc_metadata = opt_get(hparams, ['produce_ctc_metadata'], False)
self.debug_failures = opt_get(hparams, ['debug_loading_failures'], False)
self.conditioning_candidates = opt_get(
hparams, ['num_conditioning_candidates'], 1)
self.conditioning_length = opt_get(
hparams, ['conditioning_length'], 44100)
self.produce_ctc_metadata = opt_get(
hparams, ['produce_ctc_metadata'], False)
self.debug_failures = opt_get(
hparams, ['debug_loading_failures'], False)
self.text_cleaners = hparams.text_cleaners
self.sample_rate = hparams.sample_rate
self.aligned_codes_to_audio_ratio = 443 * self.sample_rate // 22050
self.max_wav_len = opt_get(hparams, ['max_wav_length'], None)
self.load_aligned_codes = opt_get(hparams, ['load_aligned_codes'], False)
self.load_aligned_codes = opt_get(
hparams, ['load_aligned_codes'], False)
if self.max_wav_len is not None:
self.max_aligned_codes = self.max_wav_len // self.aligned_codes_to_audio_ratio
self.max_text_len = opt_get(hparams, ['max_text_length'], None)
assert self.max_wav_len is not None and self.max_text_len is not None
self.use_bpe_tokenizer = opt_get(hparams, ['use_bpe_tokenizer'], False)
if self.use_bpe_tokenizer:
from data.audio.voice_tokenizer import VoiceBpeTokenizer
self.tokenizer = VoiceBpeTokenizer(opt_get(hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
from dlas.data.audio.voice_tokenizer import VoiceBpeTokenizer
self.tokenizer = VoiceBpeTokenizer(opt_get(
hparams, ['tokenizer_vocab'], '../experiments/bpe_lowercase_asr_256.json'))
else:
self.tokenizer = CharacterTokenizer()
self.ipa_phoneme_tokenizer = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft").tokenizer
self.ipa_phoneme_tokenizer = Wav2Vec2Processor.from_pretrained(
"facebook/wav2vec2-lv-60-espeak-cv-ft").tokenizer
self.ipa_phoneme_tokenizer.do_phonemize = False
self.skipped_items = 0  # records how many items are skipped when accessing an index.
# records how many items are skipped when accessing an index.
self.skipped_items = 0

self.load_times = torch.zeros((256,))
self.load_ind = 0
@@ -117,7 +129,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
try:  # This can fail when seeking to a UTF-8 escape byte.
f.readline()
except:
return self.load_random_line(depth=depth + 1)  # On failure, just recurse and try again.
# On failure, just recurse and try again.
return self.load_random_line(depth=depth + 1)
l2 = f.readline()

if l2:
@@ -126,14 +139,16 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
return parse_tsv_aligned_codes(l2, base_path), type, is_phonetic
except:
print(f"error parsing random offset: {sys.exc_info()}")
return self.load_random_line(depth=depth+1)  # On failure, just recurse and try again.
# On failure, just recurse and try again.
return self.load_random_line(depth=depth+1)

def get_ctc_metadata(self, codes):
grouped = groupby(codes.tolist())
rcodes, repeats, seps = [], [], [0]
for val, group in grouped:
if val == 0:
seps[-1] = len(list(group))  # This is a very important distinction! It means the padding belongs to the character proceeding it.
# This is a very important distinction! It means the padding belongs to the character proceeding it.
seps[-1] = len(list(group))
else:
rcodes.append(val)
repeats.append(len(list(group)))
@@ -149,7 +164,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if rcodes.shape[0] < self.max_text_len:
gap = self.max_text_len - rcodes.shape[0]
rcodes = F.pad(rcodes, (0, gap))
repeats = F.pad(repeats, (0, gap), value=1)  # The minimum value for repeats is 1, hence this is the pad value too.
# The minimum value for repeats is 1, hence this is the pad value too.
repeats = F.pad(repeats, (0, gap), value=1)
seps = F.pad(seps, (0, gap))
elif rcodes.shape[0] > self.max_text_len:
rcodes = rcodes[:self.max_text_len]
@@ -171,7 +187,7 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if text is None or len(text.strip()) == 0:
raise ValueError
cond, cond_is_self = load_similar_clips(apt[0], self.conditioning_length, self.sample_rate,
n=self.conditioning_candidates) if self.load_conditioning else (None, False)
n=self.conditioning_candidates) if self.load_conditioning else (None, False)
except:
if self.skipped_items > 100:
raise  # Rethrow if we have nested too far.
@@ -185,12 +201,13 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
self.skipped_items = 0
if wav is None or \
(self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len) or \
(self.max_text_len is not None and tseq.shape[0] > self.max_text_len):
(self.max_text_len is not None and tseq.shape[0] > self.max_text_len):
# Basically, this audio file is nonexistent or too long to be supported by the dataset.
# It's hard to handle this situation properly. Best bet is to return the a random valid token and skew the dataset somewhat as a result.
if self.debug_failures:
print(f"error loading {path}: ranges are out of bounds; {wav.shape[-1]}, {tseq.shape[0]}")
rv = random.randint(0,len(self)-1)
print(
f"error loading {path}: ranges are out of bounds; {wav.shape[-1]}, {tseq.shape[0]}")
rv = random.randint(0, len(self)-1)
return self[rv]

# Shift phonetic token and aligned_code tokens over.
@@ -206,7 +223,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
if wav.shape[-1] != self.max_wav_len:
wav = F.pad(wav, (0, self.max_wav_len - wav.shape[-1]))
# These codes are aligned to audio inputs, so make sure to pad them as well.
aligned_codes = F.pad(aligned_codes, (0, self.max_aligned_codes-aligned_codes.shape[0]))
aligned_codes = F.pad(
aligned_codes, (0, self.max_aligned_codes-aligned_codes.shape[0]))
if tseq.shape[0] != self.max_text_len:
tseq = F.pad(tseq, (0, self.max_text_len - tseq.shape[0]))

@@ -237,7 +255,8 @@ class FastPairedVoiceDataset(torch.utils.data.Dataset):
return res

def __len__(self):
return self.total_size_bytes // 1000  # 1000 cuts down a TSV file to the actual length pretty well.
# 1000 cuts down a TSV file to the actual length pretty well.
return self.total_size_bytes // 1000


class FastPairedVoiceDebugger:
@@ -257,7 +276,8 @@ class FastPairedVoiceDebugger:
if isinstance(state, dict):
self.total_items = opt_get(state, ['total_items'], 0)
self.loaded_items = opt_get(state, ['loaded_items'], 0)
self.self_conditioning_items = opt_get(state, ['self_conditioning_items'], 0)
self.self_conditioning_items = opt_get(
state, ['self_conditioning_items'], 0)

def update(self, batch):
self.total_items += batch['wav'].shape[0]
@@ -266,7 +286,8 @@ class FastPairedVoiceDebugger:
for filename in batch['filenames']:
self.unique_files.add(hashlib.sha256(filename.encode('utf-8')))
if 'conditioning' in batch.keys():
self.self_conditioning_items += batch['conditioning_contains_self'].sum().item()
self.self_conditioning_items += batch['conditioning_contains_self'].sum(
).item()

def get_debugging_map(self):
return {
@@ -284,7 +305,7 @@ if __name__ == '__main__':
'mode': 'fast_paired_voice_audio_with_phonemes',
'path': ['y:/libritts/train-clean-100/transcribed-oco.tsv',],
'phoneme_paths': ['y:/libritts/train-other-500/transcribed-phoneme-oco.tsv'],
'types': [0,0],
'types': [0, 0],
'normal_text_end_token': 256,
'phase': 'train',
'n_workers': 0,
@@ -299,11 +320,12 @@ if __name__ == '__main__':
'load_aligned_codes': False,
'debug_loading_failures': True,
}
from data import create_dataset, create_dataloader
from data import create_dataloader, create_dataset

def save(b, i, ib, key, c=None):
if c is not None:
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav', b[key][ib][c], 22050)
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav',
b[key][ib][c], 22050)
else:
torchaudio.save(f'{i}_clip_{ib}_{key}.wav', b[key][ib], 22050)

@@ -314,14 +336,13 @@ if __name__ == '__main__':
max_pads, max_repeats = 0, 0
for i, b in tqdm(enumerate(dl)):
for ib in range(batch_sz):
#max_pads = max(max_pads, b['ctc_pads'].max())
#max_repeats = max(max_repeats, b['ctc_repeats'].max())
# max_pads = max(max_pads, b['ctc_pads'].max())
# max_repeats = max(max_repeats, b['ctc_repeats'].max())
print(f'{i} {ib} {b["real_text"][ib]}')
#save(b, i, ib, 'wav')
#save(b, i, ib, 'conditioning', 0)
#save(b, i, ib, 'conditioning', 1)
# save(b, i, ib, 'wav')
# save(b, i, ib, 'conditioning', 0)
# save(b, i, ib, 'conditioning', 1)
pass
if i > 15:
break
print(max_pads, max_repeats)
dlas/data/audio/gpt_tts_dataset.py

@@ -6,9 +6,8 @@ import torch.utils.data
from torch import LongTensor
from tqdm import tqdm

from models.audio.tts.tacotron2 import load_filepaths_and_text
from models.audio.tts.tacotron2 import symbols
from models.audio.tts.tacotron2 import text_to_sequence
from dlas.models.audio.tts.tacotron2 import (load_filepaths_and_text, symbols,
text_to_sequence)


class GptTtsDataset(torch.utils.data.Dataset):
@@ -21,7 +20,7 @@ class GptTtsDataset(torch.utils.data.Dataset):
def __init__(self, opt):
self.path = os.path.dirname(opt['path'])
self.audiopaths_and_text = load_filepaths_and_text(opt['path'])
self.text_cleaners=['english_cleaners']
self.text_cleaners = ['english_cleaners']

self.MEL_DICTIONARY_SIZE = opt['mel_vocab_size']+3
self.MEL_START_TOKEN = LongTensor([self.MEL_DICTIONARY_SIZE-3])
@@ -32,7 +31,8 @@ class GptTtsDataset(torch.utils.data.Dataset):
audiopath_and_text = self.audiopaths_and_text[index]
audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
text = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
text = torch.cat([self.TEXT_START_TOKEN, text, self.TEXT_STOP_TOKEN], dim=0)
text = torch.cat([self.TEXT_START_TOKEN, text,
self.TEXT_STOP_TOKEN], dim=0)

# Fetch quantized MELs
quant_path = audiopath.replace('wavs/', 'quantized_mels/') + '.pth'
@@ -57,8 +57,9 @@ class GptTtsCollater():

def __call__(self, batch):
text_lens = [len(x[0]) for x in batch]
#max_text_len = max(text_lens)
max_text_len = self.MAX_SYMBOLS_PER_PHRASE  # This forces all outputs to have the full 200 characters. Testing if this makes a difference.
# max_text_len = max(text_lens)
# This forces all outputs to have the full 200 characters. Testing if this makes a difference.
max_text_len = self.MAX_SYMBOLS_PER_PHRASE
mel_lens = [len(x[1]) for x in batch]
max_mel_len = max(mel_lens)
texts = []
@@ -70,7 +71,8 @@ class GptTtsCollater():
text = F.pad(text, (0, max_text_len-len(text)), value=0)
text = torch.where(text == 0, text_range_embedding, text)
texts.append(text)
qmels.append(F.pad(qmel, (0, max_mel_len-len(qmel)), value=self.MEL_PAD_TOKEN))
qmels.append(F.pad(qmel, (0, max_mel_len-len(qmel)),
value=self.MEL_PAD_TOKEN))

filenames = [j[2] for j in batch]

@@ -96,7 +98,7 @@ if __name__ == '__main__':
'batch_size': 16,
'mel_vocab_size': 512,
}
from data import create_dataset, create_dataloader
from data import create_dataloader, create_dataset

ds, c = create_dataset(params, return_collate=True)
dl = create_dataloader(ds, params, collate_fn=c)
@@ -107,5 +109,5 @@ if __name__ == '__main__':
for b in tqdm(dl):
max_mel = max(max_mel, b['padded_qmel'].shape[2])
max_text = max(max_text, b['padded_text'].shape[1])
m=torch.stack(m)
m = torch.stack(m)
print(m.mean(), m.std())
dlas/data/audio/grand_conjoined_dataset.py

@@ -7,14 +7,15 @@ import torchaudio
from munch import munchify
from tqdm import tqdm

from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset
from data.text.hf_datasets_wrapper import HfDataset
from utils.util import opt_get
from dlas.data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset
from dlas.data.text.hf_datasets_wrapper import HfDataset
from dlas.utils.util import opt_get


def build_paired_voice_dataset(args):
from data.audio.paired_voice_audio_dataset import TextWavLoader as D
from models.audio.tts.tacotron2 import create_hparams

from dlas.data.audio.paired_voice_audio_dataset import TextWavLoader as D
default_params = create_hparams()
default_params.update(args)
dataset_opt = munchify(default_params)
@@ -33,6 +34,7 @@ class GrandConjoinedDataset(torch.utils.data.Dataset):

Performs tokenization at this level, ignoring any tokenization performed by upstream datasets.
"""

def __init__(self, opt):
sample_rate = 22050  # Fixed.
paired_dataset_args = opt['paired_dataset_args']
@@ -47,7 +49,8 @@ class GrandConjoinedDataset(torch.utils.data.Dataset):
self.max_solo_text_length = opt['max_solo_text_length']
self.collate = opt_get(opt, ['needs_collate'], False)
self.sample_rate = sample_rate
self.num_conditioning_candidates = opt_get(opt, ['num_conditioning_candidates'], 0)
self.num_conditioning_candidates = opt_get(
opt, ['num_conditioning_candidates'], 0)
self.conditioning_length = opt_get(opt, ['conditioning_length'], 44000)
load_conditioning = self.num_conditioning_candidates > 0

@@ -75,7 +78,8 @@ class GrandConjoinedDataset(torch.utils.data.Dataset):
def fetch_text_at(self, i):
try:
txt = self.text[i % len(self.text)]['text']
assert '*' not in txt  # This is a hack to get around the use of '*' to mask expletives in some text-only datasets. There really isn't a linguistic use for this character anyways.
# This is a hack to get around the use of '*' to mask expletives in some text-only datasets. There really isn't a linguistic use for this character anyways.
assert '*' not in txt
tok = self.speech_and_text.get_text(txt)
padding_required = self.max_solo_text_length - tok.shape[0]
if padding_required < 0:
@@ -137,7 +141,8 @@ class GrandConjoinedDataset(torch.utils.data.Dataset):
sp = self.speech[i % len(self.speech)]
# Set upper bound on solo speech lengths. This is handled automatically when collation is turned off, but needs to be done otherwise.
sp['clip'] = sp['clip'][:, :self.max_solo_audio_length]
sp['clip_lengths'] = sp['clip_lengths'].clamp(0, self.max_solo_audio_length)
sp['clip_lengths'] = sp['clip_lengths'].clamp(
0, self.max_solo_audio_length)
return self.optionally_add_conditioning_candidates({
'paired_audio': snt['wav'],
'paired_audio_lengths': snt['wav_lengths'],
@@ -205,7 +210,7 @@ if __name__ == '__main__':
'use_bpe_tokenizer': False,
},
}
from data import create_dataset, create_dataloader
from data import create_dataloader, create_dataset
os.remove('test_cache_delete_me2.pth')

ds, c = create_dataset(train_params, return_collate=True)
@@ -213,7 +218,8 @@ if __name__ == '__main__':

def save(b, i, ib, key, c=None):
if c is not None:
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav', b[key][ib][c], 22050)
torchaudio.save(f'{i}_clip_{ib}_{key}_{c}.wav',
b[key][ib][c], 22050)
else:
torchaudio.save(f'{i}_clip_{ib}_{key}.wav', b[key][ib], 22050)

@@ -224,16 +230,17 @@ if __name__ == '__main__':
m = None
for i, b in tqdm(enumerate(dl)):
for ib in range(batch_sz):
#save(b, i, ib, 'paired_audio')
#save(b, i, ib, 'paired_audio_conditioning', 0)
#save(b, i, ib, 'paired_audio_conditioning', 1)
print(f'Paired file: {b["paired_file"][ib]} text: {b["paired_text"][ib]}')
print(f'Paired text decoded: {decode(b, ib, "paired_text_tokens")}')
#save(b, i, ib, 'speech_audio')
#save(b, i, ib, 'speech_audio_conditioning', 0)
#save(b, i, ib, 'speech_audio_conditioning', 1)
#print(f'Text: {b["text_text"][ib]}')
#print(f'Text decoded: {decode(b, ib, "text_tokens")}')
# save(b, i, ib, 'paired_audio')
# save(b, i, ib, 'paired_audio_conditioning', 0)
# save(b, i, ib, 'paired_audio_conditioning', 1)
print(
f'Paired file: {b["paired_file"][ib]} text: {b["paired_text"][ib]}')
print(
f'Paired text decoded: {decode(b, ib, "paired_text_tokens")}')
# save(b, i, ib, 'speech_audio')
# save(b, i, ib, 'speech_audio_conditioning', 0)
# save(b, i, ib, 'speech_audio_conditioning', 1)
# print(f'Text: {b["text_text"][ib]}')
# print(f'Text decoded: {decode(b, ib, "text_tokens")}')
if i > 5:
break
dlas/data/audio/paired_voice_audio_dataset.py

@@ -7,32 +7,36 @@ import torch.utils.data
import torchaudio
from tqdm import tqdm

from data.audio.unsupervised_audio_dataset import load_audio
from data.util import find_files_of_type, is_audio_file
from models.audio.tts.tacotron2 import load_filepaths_and_text
from models.audio.tts.tacotron2 import text_to_sequence
from utils.util import opt_get
from dlas.data.audio.unsupervised_audio_dataset import load_audio
from dlas.data.util import find_files_of_type, is_audio_file
from dlas.models.audio.tts.tacotron2 import (load_filepaths_and_text,
text_to_sequence)
from dlas.utils.util import opt_get


def load_tsv(filename):
with open(filename, encoding='utf-8') as f:
components = [line.strip().split('\t') for line in f]
base = os.path.dirname(filename)
filepaths_and_text = [[os.path.join(base, f'{component[1]}'), component[0]] for component in components]
filepaths_and_text = [
[os.path.join(base, f'{component[1]}'), component[0]] for component in components]
return filepaths_and_text


def load_mozilla_cv(filename):
with open(filename, encoding='utf-8') as f:
components = [line.strip().split('\t') for line in f][1:]  # First line is the header
components = [line.strip().split('\t')
for line in f][1:]  # First line is the header
base = os.path.dirname(filename)
filepaths_and_text = [[os.path.join(base, f'clips/{component[1]}'), component[2]] for component in components]
filepaths_and_text = [[os.path.join(
base, f'clips/{component[1]}'), component[2]] for component in components]
return filepaths_and_text


def load_voxpopuli(filename):
with open(filename, encoding='utf-8') as f:
lines = [line.strip().split('\t') for line in f][1:]  # First line is the header
lines = [line.strip().split('\t')
for line in f][1:]  # First line is the header
base = os.path.dirname(filename)
filepaths_and_text = []
for line in lines:
@@ -40,7 +44,8 @@ def load_voxpopuli(filename):
continue
file, raw_text, norm_text, speaker_id, split, gender = line
year = file[:4]
filepaths_and_text.append([os.path.join(base, year, f'{file}.ogg.wav'), raw_text])
filepaths_and_text.append(
[os.path.join(base, year, f'{file}.ogg.wav'), raw_text])
return filepaths_and_text

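As a reference point for load_tsv above: each line is expected to carry the transcript in the first tab-separated column and an audio path relative to the TSV's directory in the second. A hedged illustration with invented filenames:

# Hypothetical contents of y:/data/transcribed.tsv (paths are invented):
#   Hello there, General Kenobi.<TAB>clips/0001.wav
#   You are a bold one.<TAB>clips/0002.wav
#
# load_tsv('y:/data/transcribed.tsv') would then return [path, text] pairs:
#   [['y:/data/clips/0001.wav', 'Hello there, General Kenobi.'],
#    ['y:/data/clips/0002.wav', 'You are a bold one.']]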
@@ -56,8 +61,10 @@ class TextWavLoader(torch.utils.data.Dataset):
assert len(self.path) == len(fetcher_mode)

self.load_conditioning = opt_get(hparams, ['load_conditioning'], False)
self.conditioning_candidates = opt_get(hparams, ['num_conditioning_candidates'], 3)
self.conditioning_length = opt_get(hparams, ['conditioning_length'], 44100)
self.conditioning_candidates = opt_get(
hparams, ['num_conditioning_candidates'], 3)
self.conditioning_length = opt_get(
hparams, ['conditioning_length'], 44100)
self.audiopaths_and_text = []
for p, fm in zip(self.path, fetcher_mode):
if fm == 'lj' or fm == 'libritts':
@@ -65,10 +72,12 @@ class TextWavLoader(torch.utils.data.Dataset):
elif fm == 'tsv':
fetcher_fn = load_tsv
elif fm == 'mozilla_cv':
assert not self.load_conditioning  # Conditioning inputs are incompatible with mozilla_cv
# Conditioning inputs are incompatible with mozilla_cv
assert not self.load_conditioning
fetcher_fn = load_mozilla_cv
elif fm == 'voxpopuli':
assert not self.load_conditioning  # Conditioning inputs are incompatible with voxpopuli