DL-Art-School/codes/scripts/audio/preparation/gen_dvae_codes.py
2022-06-03 11:02:28 -06:00

42 lines
1.5 KiB
Python

import os
import torch
from tqdm import tqdm
from scripts.audio.gen.speech_synthesis_utils import load_speech_dvae, wav_to_mel
# Divisor that turns a clip length into a number of DVAE codes. The original
# script hard-coded 1024; presumably this is the number of audio samples
# covered by one code (mel hop x DVAE stride) - TODO confirm against the DVAE.
SAMPLES_PER_CODE = 1024

# Extra codes kept past the nominal end of each clip. +4 seems empirically to
# be a good clipping point - it seems to preserve the termination codes.
TRAILING_CODES = 4


def main(input_folder='C:\\Users\\James\\Downloads\\lex2\\lexfridman_training_mp3',
         output_folder='C:\\Users\\James\\Downloads\\lex2\\quantized'):
    """Quantize every audio clip under *input_folder* into DVAE codes.

    Builds an 'unsupervised_audio' dataset over ``input_folder``, runs each
    batch through ``wav_to_mel`` and the speech DVAE on the GPU, trims the
    resulting codebook indices per clip, and writes them (as a plain Python
    list, via ``torch.save``) to ``output_folder``. The output path mirrors the
    clip's path relative to ``input_folder`` with ``.pth`` appended.

    Args:
        input_folder: Root directory scanned by the dataset; also where the
            dataset cache file ``cache.pth`` is placed.
        output_folder: Root directory that receives the ``.pth`` code files.
            Created if it does not exist.

    Note:
        Requires a CUDA device: both the DVAE and each audio batch are moved
        to the GPU with ``.cuda()``.
    """
    params = {
        'mode': 'unsupervised_audio',
        'path': [input_folder],
        'cache_path': f'{input_folder}/cache.pth',
        'sampling_rate': 22050,
        'pad_to_samples': 441000,
        'resample_clip': False,
        'extra_samples': 0,
        'phase': 'train',
        'n_workers': 2,
        'batch_size': 64,
    }

    # Imported here, as in the original script, rather than at module top.
    from data import create_dataset, create_dataloader

    os.makedirs(output_folder, exist_ok=True)
    dataset = create_dataset(params)
    loader = create_dataloader(dataset, params)
    dvae = load_speech_dvae().cuda()

    # Inference only: no gradients are needed to read out codebook indices.
    with torch.no_grad():
        for batch in tqdm(loader):
            audio = batch['clip'].cuda()
            mel = wav_to_mel(audio)
            codes = dvae.get_codebook_indices(mel)

            # Walk the batch item-by-item; codes, lengths and paths are
            # assumed to be aligned along the batch dimension.
            for clip_codes, clip_length, clip_path in zip(
                    codes, batch['clip_lengths'], batch['path']):
                # int() yields a plain Python integer so `//` is ordinary
                # integer floor division instead of tensor floor division,
                # whose rounding behaviour (and deprecation warning) varies
                # across PyTorch releases. Assumes each length is a scalar
                # tensor or an int - TODO confirm against the dataset.
                n_codes = int(clip_length) // SAMPLES_PER_CODE + TRAILING_CODES
                trimmed = clip_codes[:n_codes]

                # Mirror the input directory layout under output_folder.
                relative = os.path.relpath(clip_path, input_folder)
                outp = os.path.join(output_folder, relative + ".pth")
                os.makedirs(os.path.dirname(outp), exist_ok=True)
                torch.save(trimmed.tolist(), outp)


if __name__ == '__main__':
    main()