a bunch of shit to salvage my old encodec-quantized audio because dac-encoded audio just does not want to converge

This commit is contained in:
mrq 2024-05-12 10:17:29 -05:00
parent 917eeb40d2
commit 4f1593c8db
3 changed files with 187 additions and 17 deletions

View File

@@ -5,15 +5,21 @@ import torchaudio
from tqdm.auto import tqdm
from pathlib import Path
from vall_e.config import cfg
from vall_e.emb.g2p import encode as valle_phonemize
from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension
# to-do: use argparser
# things that could be args
cfg.sample_rate = 24_000
cfg.inference.audio_backend = "encodec"
input_audio = "voices"
input_metadata = "training/metadata"
output_dataset = "training/data"
input_metadata = "./training/metadata"
output_dataset = f"./training/data-{'2' if cfg.sample_rate else '4'}4KHz-{cfg.inference.audio_backend}"
device = "cuda"
audio_extension = ".dac" if cfg.inference.audio_backend == "dac" else ".enc"
slice = "auto"
missing = {
"transcription": [],
@@ -28,6 +34,9 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
print("Is not dir:", f'./{input_audio}/{dataset_name}/')
continue
if dataset_name in ["LibriVox", "Audiobooks"]:
continue
for speaker_id in tqdm(sorted(os.listdir(f'./{input_audio}/{dataset_name}/')), desc=f"Processing speaker in {dataset_name}"):
if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
@@ -35,6 +44,23 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
continue
os.makedirs(f'./{output_dataset}/{dataset_name}/{speaker_id}/', exist_ok=True)
if speaker_id == "Noise":
for filename in sorted(os.listdir(f'./{input_audio}/{dataset_name}/{speaker_id}/')):
inpath = Path(f'./{input_audio}/{dataset_name}/{speaker_id}/{filename}')
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{filename}')
if _replace_file_extension(outpath, audio_extension).exists():
continue
waveform, sample_rate = torchaudio.load(inpath)
qnt = valle_quantize(waveform, sr=sample_rate, device=device)
if cfg.inference.audio_backend == "dac":
qnt.save(_replace_file_extension(outpath, audio_extension))
else:
torch.save( qnt, _replace_file_extension(outpath, audio_extension) )
continue
metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/whisper.json')
if not metadata_path.exists():
@@ -47,6 +73,9 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
missing["transcription"].append(str(metadata_path))
continue
if f'{dataset_name}/{speaker_id}' not in dataset:
dataset.append(f'{dataset_name}/{speaker_id}')
txts = []
wavs = []
@@ -64,9 +93,6 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
waveform, sample_rate = None, None
language = metadata[filename]["language"] if "language" in metadata[filename] else "english"
if f'{dataset_name}/{speaker_id}' not in dataset:
dataset.append(f'{dataset_name}/{speaker_id}')
if len(metadata[filename]["segments"]) == 0 or not use_slices:
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}.{extension}')
text = metadata[filename]["text"]
@@ -74,7 +100,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
if len(text) == 0:
continue
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, ".dac").exists():
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists():
continue
if not _replace_file_extension(outpath, ".json").exists():
@@ -84,7 +110,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
language,
))
if not _replace_file_extension(outpath, ".dac").exists():
if not _replace_file_extension(outpath, audio_extension).exists():
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
@@ -100,7 +126,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
i = i + 1
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, ".dac").exists():
if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists():
continue
if not _replace_file_extension(outpath, ".json").exists():
@@ -110,7 +136,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
language,
))
if not _replace_file_extension(outpath, ".dac").exists():
if not _replace_file_extension(outpath, audio_extension).exists():
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
@@ -132,7 +158,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
))
if len(txts) > 0:
for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}"):
for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}", disable=True):
outpath, text, language = job
phones = valle_phonemize(text)
data = {
@@ -147,10 +173,13 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
try:
outpath, waveform, sample_rate = job
qnt = valle_quantize(waveform, sr=sample_rate, device=device)
qnt.save(_replace_file_extension(outpath, ".dac"))
if cfg.inference.audio_backend == "dac":
qnt.save(_replace_file_extension(outpath, audio_extension))
else:
torch.save( qnt, _replace_file_extension(outpath, audio_extension) )
except Exception as e:
print(f"Failed to quantize: {outpath}:", e)
continue
open("./training/missing.json", 'w', encoding='utf-8').write(json.dumps(missing))
open("./training/dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))
open("./missing.json", 'w', encoding='utf-8').write(json.dumps(missing))
open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))

View File

@@ -145,7 +145,7 @@ def _get_hdf5_paths( data_dir, type="training", validate=False ):
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
key = f"/{type}/{_get_hdf5_path(data_dir)}"
return [ Path(f"{key}/{child.attrs['id']}") for child in cfg.hdf5[key].values() if not validate or _validate(child) ] if key in cfg.hdf5 else []
return [ Path(f"{key}/{child}") for child in cfg.hdf5[key].keys() if not validate or _validate(child) ] if key in cfg.hdf5 else []
def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ):
if isinstance(path, str):
@@ -1003,6 +1003,145 @@ def create_dataset_hdf5( skip_existing=True ):
hf.create_dataset('symmap', data=json.dumps(symmap))
hf.close()
def extract_dataset_hdf5( skip_existing=True ):
cfg.dataset.use_hdf5 = True
cfg.load_hdf5(write=False)
hf = cfg.hdf5
symmap = get_phone_symmap()
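# hard-coded inverse phoneme map (token id -> IPA symbol); ids 1 and 2 are the <s>/</s> sentinels stripped below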
reverse_symmap = {"1":"<s>","2":"</s>","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"","12":"","13":"d","14":"ɹ","15":"tˈ","16":"","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"","75":"kˈ","76":"ɔ","77":"zˈ","78":"","79":"","80":"vˈ","81":"","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"","86":"pˈ","87":"ðˌ","88":"","89":"","90":"","91":"̩","92":"ʔ","93":"","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"","101":"uːˈ","102":"iːˈ","103":"","104":".ˈ","105":"","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"","127":"ɫ","128":"q","129":"","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":"","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"","150":"","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"","180":"","181":"“ˈ","182":"“ˌ","183":";ˈ","184":"","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"","221":"eˈ","222":"ʍ","223":"","224":"","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
root = str(cfg.data_dir)
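# walk one dataset group in the HDF5 store and write each utterance back out as loose <id>.enc / <id>.json files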
def add( dir, type="training", audios=True, texts=True ):
name = str(dir)
name = name.replace(root, "data/")
Path(f'{cfg.relpath}/{name}/').mkdir(parents=True, exist_ok=True)
if f'{type}/{name}' not in hf:
return
ids = [ key for key in hf[f'{type}/{name}'].keys() ]
for id in tqdm(ids, desc=f"Processing {name}"):
try:
key = f'{type}/{name}/{id}'
if key not in hf:
tqdm.write(f'Missing key: {key}')
continue
group = hf[key]
audio_exists = "audio" in group
text_exists = "text" in group
if not audio_exists or not text_exists:
tqdm.write(f'Missing audio/text: {key}')
continue
audio_path = Path(f'{cfg.relpath}/{name}/{id}.enc')
text_path = Path(f'{cfg.relpath}/{name}/{id}.json')
# audio
if audios and audio_exists and not audio_path.exists():
qnt = group["audio"][:, :]
torch.save( qnt, audio_path )
# text
if texts and text_exists and not text_path.exists():
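# drop the leading <s> and trailing </s> sentinels before decoding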
tokens = group["text"][:][1:-1]
phones = [ reverse_symmap[f'{token}'] for token in tokens ]
phones = list("".join(phones).replace(" ", " "))
j = {
"text": "",
"phonemes": phones,
"language": "en"
}
with open(text_path, "w", encoding="utf-8") as f:
f.write( json.dumps( j ) )
except Exception as e:
raise e
# training
for data_dir in tqdm(cfg.dataset.training, desc="Processing Training"):
add( data_dir, type="training" )
# validation
for data_dir in tqdm(cfg.dataset.validation, desc='Processing Validation'):
add( data_dir, type="validation" )
# noise
for data_dir in tqdm(cfg.dataset.noise, desc='Processing Noise'):
add( data_dir, type="noise", texts=False )
hf.close()
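# Optional spot check (sketch): reload one extracted pair. `stem` is a
# hypothetical path stem like f'{cfg.relpath}/{name}/{id}'; the .enc file
# holds the code array saved above, the .json the decoded phoneme list.
def _check_extracted( stem ):
	qnt = torch.load( f'{stem}.enc' )
	meta = json.loads( open( f'{stem}.json', 'r', encoding='utf-8' ).read() )
	print( qnt.shape, "".join( meta["phonemes"] ) )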
def retokenize_dataset_hdf5( skip_existing=True ):
cfg.dataset.use_hdf5 = True
cfg.load_hdf5(write=True)
hf = cfg.hdf5
symmap = get_phone_symmap()
reverse_symmap = {"1":"<s>","2":"</s>","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"","12":"","13":"d","14":"ɹ","15":"tˈ","16":"","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"","75":"kˈ","76":"ɔ","77":"zˈ","78":"","79":"","80":"vˈ","81":"","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"","86":"pˈ","87":"ðˌ","88":"","89":"","90":"","91":"̩","92":"ʔ","93":"","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"","101":"uːˈ","102":"iːˈ","103":"","104":".ˈ","105":"","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"","127":"ɫ","128":"q","129":"","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":"","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"","150":"","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"","180":"","181":"“ˈ","182":"“ˌ","183":";ˈ","184":"","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"","221":"eˈ","222":"ʍ","223":"","224":"","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
root = str(cfg.data_dir)
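# decode each stored token sequence through reverse_symmap, then re-encode it in place with the current tokenizer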
def add( dir, type="training" ):
name = str(dir)
name = name.replace(root, "data/")
Path(f'{cfg.relpath}/{name}/').mkdir(parents=True, exist_ok=True)
if f'{type}/{name}' not in hf:
return
ids = [ key for key in hf[f'{type}/{name}'].keys() ]
for id in tqdm(ids, desc=f"Processing {name}"):
try:
key = f'{type}/{name}/{id}'
if key not in hf:
tqdm.write(f'Missing key: {key}')
continue
group = hf[key]
if not "text" in group:
tqdm.write(f'Missing text: {key}')
continue
tokens = group["text"][:][1:-1]
content = list("".join([ reverse_symmap[f'{token}'] for token in tokens ]).replace(" ", " "))
tokens = cfg.tokenizer.encode("".join(content))
tokens = np.array(tokens).astype(np.uint8) # assumes the tokenizer vocab fits in a byte (ids < 256)
del group['text']
group.create_dataset('text', data=tokens, compression='lzf')
except Exception as e:
raise e
# training
for data_dir in tqdm(cfg.dataset.training, desc="Processing Training"):
add( data_dir, type="training" )
# validation
for data_dir in tqdm(cfg.dataset.validation, desc='Processing Validation'):
add( data_dir, type="validation" )
# write symmap
if "symmap" in hf:
del hf['symmap']
hf.create_dataset('symmap', data=json.dumps(symmap))
hf.close()
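# Optional sanity check (sketch): decode one retokenized entry back to its
# phoneme string. The default key is hypothetical, and this assumes
# cfg.tokenizer pairs decode() with the encode() used above.
def _check_retokenized( key="training/data/some_speaker/some_utterance" ):
	cfg.load_hdf5(write=False)
	tokens = cfg.hdf5[f'{key}/text'][:]
	print( cfg.tokenizer.decode( tokens.tolist() ) )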
if __name__ == "__main__":
import argparse
@@ -1023,6 +1162,10 @@ if __name__ == "__main__":
if args.action == "hdf5":
create_dataset_hdf5()
if args.action == "extract-hdf5":
extract_dataset_hdf5()
if args.action == "retokenize-hdf5":
retokenize_dataset_hdf5()
elif args.action == "metadata":
create_dataset_metadata()
elif args.action == "sample":

View File

@@ -144,7 +144,6 @@ def _load_vocos_model(device="cuda", levels=cfg.model.max_levels):
@cache
def _load_dac_model(device="cuda", levels=cfg.model.max_levels):
kwargs = dict(model_type="24khz",model_bitrate="8kbps",tag="latest")
"""
if not cfg.variable_sample_rate:
# yes there's a better way, something like f'{cfg.sample_rate//1000}khz'
if cfg.sample_rate == 44_000:
@@ -155,7 +154,6 @@ def _load_dac_model(device="cuda", levels=cfg.model.max_levels):
kwargs["model_type"] = "16khz"
else:
raise Exception(f'unsupported sample rate: {cfg.sample_rate}')
"""
model = __load_dac_model(**kwargs)
model = model.to(device)
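Net effect of removing those quotes: the sample-rate branch executes again, so _load_dac_model picks the DAC model from cfg.sample_rate instead of always using the default 24khz kwargs, and an unsupported rate raises immediately.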