DL-Art-School/codes/scripts/audio/preparation/gen_dvae_codes.py

import os

import torch
from tqdm import tqdm

from scripts.audio.gen.speech_synthesis_utils import load_speech_dvae, wav_to_mel

# Script entry point: feed every clip found under `input_folder` through the
# speech DVAE and store each clip's codebook indices under `output_folder`,
# mirroring the clip's path relative to `input_folder` with ".pth" appended.
if __name__ == '__main__':
    input_folder = 'C:\\Users\\James\\Downloads\\lex2\\lexfridman_training_mp3'
    output_folder = 'C:\\Users\\James\\Downloads\\lex2\\quantized'

    # One options dict is handed to both create_dataset() and create_dataloader().
    dataset_opts = {
        'mode': 'unsupervised_audio',
        'path': [input_folder],
        'cache_path': f'{input_folder}/cache.pth',
        'sampling_rate': 22050,
        'pad_to_samples': 441000,
        'resample_clip': False,
        'extra_samples': 0,
        'phase': 'train',
        'n_workers': 2,
        'batch_size': 64,
    }
    from data import create_dataset, create_dataloader
    os.makedirs(output_folder, exist_ok=True)

    loader = create_dataloader(create_dataset(dataset_opts), dataset_opts)
    dvae = load_speech_dvae().cuda()

    # Inference only, so gradient tracking is switched off for the whole pass.
    with torch.no_grad():
        for batch in tqdm(loader):
            clips = batch['clip'].cuda()
            all_codes = dvae.get_codebook_indices(wav_to_mel(clips))

            for idx in range(clips.shape[0]):
                # Trim the code sequence according to the clip's own length.
                # NOTE(review): 1024 is presumably the number of audio samples
                # covered by one code - confirm against the DVAE config.
                # The extra 4 codes were found empirically to be a good clipping
                # point; it seems to preserve the termination codes.
                keep = batch['clip_lengths'][idx] // 1024 + 4
                trimmed = all_codes[idx, :keep]

                # Recreate the clip's sub-directory layout below output_folder.
                rel_path = os.path.relpath(batch['path'][idx], input_folder)
                dest = os.path.join(output_folder, rel_path + ".pth")
                os.makedirs(os.path.dirname(dest), exist_ok=True)

                # Codes are written as a plain Python list, not as a tensor.
                torch.save(trimmed.tolist(), dest)
codes generation script 2022-06-03 17:02:28 +00:00			`import os`

			`import torch`
			`from tqdm import tqdm`

			`from scripts.audio.gen.speech_synthesis_utils import load_speech_dvae, wav_to_mel`

			`if __name__ == '__main__':`
			`input_folder = 'C:\\Users\\James\\Downloads\\lex2\\lexfridman_training_mp3'`
			`output_folder = 'C:\\Users\\James\\Downloads\\lex2\\quantized'`

			`params = {`
			`'mode': 'unsupervised_audio',`
			`'path': [input_folder],`
			`'cache_path': f'{input_folder}/cache.pth',`
			`'sampling_rate': 22050,`
			`'pad_to_samples': 441000,`
			`'resample_clip': False,`
			`'extra_samples': 0,`
			`'phase': 'train',`
			`'n_workers': 2,`
			`'batch_size': 64,`
			`}`
			`from data import create_dataset, create_dataloader`
			`os.makedirs(output_folder, exist_ok=True)`

			`ds = create_dataset(params)`
			`dl = create_dataloader(ds, params)`

			`dvae = load_speech_dvae().cuda()`
			`with torch.no_grad():`
			`for batch in tqdm(dl):`
			`audio = batch['clip'].cuda()`
			`mel = wav_to_mel(audio)`
			`codes = dvae.get_codebook_indices(mel)`
			`for i in range(audio.shape[0]):`
			`c = codes[i, :batch['clip_lengths'][i]//1024+4] # +4 seems empirically to be a good clipping point - it seems to preserve the termination codes.`
			`fn = batch['path'][i]`
			`outp = os.path.join(output_folder, os.path.relpath(fn, input_folder) + ".pth")`
			`os.makedirs(os.path.dirname(outp), exist_ok=True)`
			`torch.save(c.tolist(), outp)`