diff --git a/scripts/process_dataset.py b/scripts/process_dataset.py
index 52b2908..65036e4 100644
--- a/scripts/process_dataset.py
+++ b/scripts/process_dataset.py
@@ -5,15 +5,21 @@ import torchaudio
from tqdm.auto import tqdm
from pathlib import Path
+from vall_e.config import cfg
from vall_e.emb.g2p import encode as valle_phonemize
from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension
-# to-do: use argparser
+# things that could be args
+cfg.sample_rate = 24_000
+cfg.inference.audio_backend = "encodec"
+
input_audio = "voices"
-input_metadata = "training/metadata"
-output_dataset = "training/data"
+input_metadata = "./training/metadata"
+output_dataset = f"./training/data-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}"
device = "cuda"
+audio_extension = ".dac" if cfg.inference.audio_backend == "dac" else ".enc"
+
slice = "auto"
missing = {
"transcription": [],
@@ -28,6 +34,9 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
if not os.path.isdir(f'./{input_audio}/{dataset_name}/'):
print("Is not dir:", f'./{input_audio}/{dataset_name}/')
continue
+
+ if dataset_name in ["LibriVox", "Audiobooks"]:
+ continue
for speaker_id in tqdm(sorted(os.listdir(f'./{input_audio}/{dataset_name}/')), desc=f"Processing speaker in {dataset_name}"):
if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'):
@@ -35,6 +44,23 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
continue
os.makedirs(f'./{output_dataset}/{dataset_name}/{speaker_id}/', exist_ok=True)
+
+ if speaker_id == "Noise":
+ for filename in sorted(os.listdir(f'./{input_audio}/{dataset_name}/{speaker_id}/')):
+			inpath = Path(f'./{input_audio}/{dataset_name}/{speaker_id}/{filename}')
+			outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{filename}')
+
+ if _replace_file_extension(outpath, audio_extension).exists():
+ continue
+
+ waveform, sample_rate = torchaudio.load(inpath)
+ qnt = valle_quantize(waveform, sr=sample_rate, device=device)
+ if cfg.inference.audio_backend == "dac":
+ qnt.save(_replace_file_extension(outpath, audio_extension))
+ else:
+ torch.save( qnt, _replace_file_extension(outpath, audio_extension) )
+
+ continue
metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/whisper.json')
if not metadata_path.exists():
@@ -47,6 +73,9 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
missing["transcription"].append(str(metadata_path))
continue
+ if f'{dataset_name}/{speaker_id}' not in dataset:
+ dataset.append(f'{dataset_name}/{speaker_id}')
+
txts = []
wavs = []
@@ -64,9 +93,6 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
waveform, sample_rate = None, None
language = metadata[filename]["language"] if "language" in metadata[filename] else "english"
- if f'{dataset_name}/{speaker_id}' not in dataset:
- dataset.append(f'{dataset_name}/{speaker_id}')
-
if len(metadata[filename]["segments"]) == 0 or not use_slices:
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}.{extension}')
text = metadata[filename]["text"]
@@ -74,7 +100,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
if len(text) == 0:
continue
- if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, ".dac").exists():
+ if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists():
continue
if not _replace_file_extension(outpath, ".json").exists():
@@ -84,7 +110,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
language,
))
- if not _replace_file_extension(outpath, ".dac").exists():
+ if not _replace_file_extension(outpath, audio_extension).exists():
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
@@ -100,7 +126,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
i = i + 1
outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}')
- if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, ".dac").exists():
+ if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists():
continue
if not _replace_file_extension(outpath, ".json").exists():
@@ -110,7 +136,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
language,
))
- if not _replace_file_extension(outpath, ".dac").exists():
+ if not _replace_file_extension(outpath, audio_extension).exists():
if waveform is None:
waveform, sample_rate = torchaudio.load(inpath)
@@ -132,7 +158,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
))
if len(txts) > 0:
- for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}"):
+ for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}", disable=True):
outpath, text, language = job
phones = valle_phonemize(text)
data = {
@@ -147,10 +173,13 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
try:
outpath, waveform, sample_rate = job
qnt = valle_quantize(waveform, sr=sample_rate, device=device)
- qnt.save(_replace_file_extension(outpath, ".dac"))
+ if cfg.inference.audio_backend == "dac":
+ qnt.save(_replace_file_extension(outpath, audio_extension))
+ else:
+ torch.save( qnt, _replace_file_extension(outpath, audio_extension) )
except Exception as e:
print(f"Failed to quantize: {outpath}:", e)
continue
-open("./training/missing.json", 'w', encoding='utf-8').write(json.dumps(missing))
-open("./training/dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))
\ No newline at end of file
+open("./missing.json", 'w', encoding='utf-8').write(json.dumps(missing))
+open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset))
diff --git a/vall_e/data.py b/vall_e/data.py
index 8f7a229..80585db 100755
--- a/vall_e/data.py
+++ b/vall_e/data.py
@@ -145,7 +145,7 @@ def _get_hdf5_paths( data_dir, type="training", validate=False ):
return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones
key = f"/{type}/{_get_hdf5_path(data_dir)}"
- return [ Path(f"{key}/{child.attrs['id']}") for child in cfg.hdf5[key].values() if not validate or _validate(child) ] if key in cfg.hdf5 else []
+ return [ Path(f"{key}/{child}") for child in cfg.hdf5[key].keys() if not validate or _validate(child) ] if key in cfg.hdf5 else []
def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ):
if isinstance(path, str):
@@ -1003,6 +1003,145 @@ def create_dataset_hdf5( skip_existing=True ):
hf.create_dataset('symmap', data=json.dumps(symmap))
hf.close()
+def extract_dataset_hdf5( skip_existing=True ):
+ cfg.dataset.use_hdf5 = True
+ cfg.load_hdf5(write=False)
+ hf = cfg.hdf5
+
+ symmap = get_phone_symmap()
+
+	reverse_symmap = {"1":"","2":"","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"dˌ","12":"mˌ","13":"d","14":"ɹ","15":"tˈ","16":"pˌ","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"wˌ","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"hˌ","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"bˌ","75":"kˈ","76":"ɔ","77":"zˈ","78":"ᵻ","79":"kˌ","80":"vˈ","81":"fˌ","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"tˌ","86":"pˈ","87":"ðˌ","88":"sˌ","89":"nˌ","90":"lˌ","91":"̩","92":"ʔ","93":"vˌ","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"jˌ","101":"uːˈ","102":"iːˈ","103":"zˌ","104":".ˈ","105":"…","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"iˌ","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"-ˌ","127":"ɫ","128":"q","129":"—","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":".ˌ","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"?ˌ","150":",ˌ","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"!ˌ","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"qˌ","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"”","180":"“","181":"“ˈ","182":"“ˌ","183":";ˈ","184":";ˌ","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"ᵝ","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"oˌ","221":"eˈ","222":"ʍ","223":"eˌ","224":"uˌ","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
+
+ root = str(cfg.data_dir)
+
+ def add( dir, type="training", audios=True, texts=True ):
+ name = str(dir)
+ name = name.replace(root, "data/")
+
+ Path(f'{cfg.relpath}/{name}/').mkdir(parents=True, exist_ok=True)
+
+ if f'{type}/{name}' not in hf:
+ return
+
+ ids = [ key for key in hf[f'{type}/{name}'].keys() ]
+
+ for id in tqdm(ids, desc=f"Processing {name}"):
+ try:
+ key = f'{type}/{name}/{id}'
+
+ if key not in hf:
+ tqdm.write(f'Missing key: {key}')
+ continue
+
+ group = hf[key]
+ audio_exists = "audio" in group
+ text_exists = "text" in group
+
+ if not audio_exists or not text_exists:
+ tqdm.write(f'Missing audio/text: {key}')
+ continue
+
+ audio_path = Path(f'{cfg.relpath}/{name}/{id}.enc')
+ text_path = Path(f'{cfg.relpath}/{name}/{id}.json')
+
+ # audio
+ if audios and audio_exists and not audio_path.exists():
+ qnt = group["audio"][:, :]
+ torch.save( qnt, f'{cfg.relpath}/{name}/{id}.enc' )
+
+ # text
+ if texts and text_exists and not text_path.exists():
+ tokens = group["text"][:][1:-1]
+ phones = [ reverse_symmap[f'{token}'] for token in tokens ]
+				phones = list("".join(phones).replace("  ", " "))
+
+ j = {
+ "text": "",
+ "phonemes": phones,
+ "language": "en"
+ }
+
+ with open(text_path, "w", encoding="utf-8") as f:
+ f.write( json.dumps( j ) )
+
+ except Exception as e:
+ raise e
+
+ # training
+ for data_dir in tqdm(cfg.dataset.training, desc="Processing Training"):
+ add( data_dir, type="training" )
+
+ # validation
+ for data_dir in tqdm(cfg.dataset.validation, desc='Processing Validation'):
+ add( data_dir, type="validation" )
+
+ # noise
+ for data_dir in tqdm(cfg.dataset.noise, desc='Processing Noise'):
+ add( data_dir, type="noise", texts=False )
+
+ hf.close()
+
+def retokenize_dataset_hdf5( skip_existing=True ):
+ cfg.dataset.use_hdf5 = True
+ cfg.load_hdf5(write=True)
+ hf = cfg.hdf5
+
+ symmap = get_phone_symmap()
+	reverse_symmap = {"1":"","2":"","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"dˌ","12":"mˌ","13":"d","14":"ɹ","15":"tˈ","16":"pˌ","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"wˌ","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"hˌ","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"bˌ","75":"kˈ","76":"ɔ","77":"zˈ","78":"ᵻ","79":"kˌ","80":"vˈ","81":"fˌ","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"tˌ","86":"pˈ","87":"ðˌ","88":"sˌ","89":"nˌ","90":"lˌ","91":"̩","92":"ʔ","93":"vˌ","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"jˌ","101":"uːˈ","102":"iːˈ","103":"zˌ","104":".ˈ","105":"…","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"iˌ","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"-ˌ","127":"ɫ","128":"q","129":"—","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":".ˌ","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"?ˌ","150":",ˌ","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"!ˌ","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"qˌ","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"”","180":"“","181":"“ˈ","182":"“ˌ","183":";ˈ","184":";ˌ","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"ᵝ","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"oˌ","221":"eˈ","222":"ʍ","223":"eˌ","224":"uˌ","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"}
+
+ root = str(cfg.data_dir)
+
+ def add( dir, type="training" ):
+ name = str(dir)
+ name = name.replace(root, "data/")
+
+ Path(f'{cfg.relpath}/{name}/').mkdir(parents=True, exist_ok=True)
+
+ if f'{type}/{name}' not in hf:
+ return
+
+ ids = [ key for key in hf[f'{type}/{name}'].keys() ]
+
+ for id in tqdm(ids, desc=f"Processing {name}"):
+ try:
+ key = f'{type}/{name}/{id}'
+
+ if key not in hf:
+ tqdm.write(f'Missing key: {key}')
+ continue
+
+ group = hf[key]
+ if not "text" in group:
+ tqdm.write(f'Missing text: {key}')
+ continue
+
+ tokens = group["text"][:][1:-1]
+				content = list("".join([ reverse_symmap[f'{token}'] for token in tokens ]).replace("  ", " "))
+
+ tokens = cfg.tokenizer.encode("".join(content))
+ tokens = np.array(tokens).astype(np.uint8)
+
+ del group['text']
+ group.create_dataset('text', data=tokens, compression='lzf')
+
+ except Exception as e:
+ raise e
+
+ # training
+ for data_dir in tqdm(cfg.dataset.training, desc="Processing Training"):
+ add( data_dir, type="training" )
+
+ # validation
+ for data_dir in tqdm(cfg.dataset.validation, desc='Processing Validation'):
+ add( data_dir, type="validation" )
+
+ # write symmap
+ if "symmap" in hf:
+ del hf['symmap']
+
+ hf.create_dataset('symmap', data=json.dumps(symmap))
+ hf.close()
+
if __name__ == "__main__":
import argparse
@@ -1023,6 +1162,10 @@ if __name__ == "__main__":
if args.action == "hdf5":
create_dataset_hdf5()
+ if args.action == "extract-hdf5":
+ extract_dataset_hdf5()
+ if args.action == "retokenize-hdf5":
+ retokenize_dataset_hdf5()
elif args.action == "metadata":
create_dataset_metadata()
elif args.action == "sample":
diff --git a/vall_e/emb/qnt.py b/vall_e/emb/qnt.py
index ea4531b..b222f89 100755
--- a/vall_e/emb/qnt.py
+++ b/vall_e/emb/qnt.py
@@ -144,7 +144,6 @@ def _load_vocos_model(device="cuda", levels=cfg.model.max_levels):
@cache
def _load_dac_model(device="cuda", levels=cfg.model.max_levels):
kwargs = dict(model_type="24khz",model_bitrate="8kbps",tag="latest")
- """
if not cfg.variable_sample_rate:
# yes there's a better way, something like f'{cfg.sample.rate//1000}hz'
if cfg.sample_rate == 44_000:
@@ -155,7 +154,6 @@ def _load_dac_model(device="cuda", levels=cfg.model.max_levels):
kwargs["model_type"] = "16khz"
else:
raise Exception(f'unsupported sample rate: {cfg.sample_rate}')
- """
model = __load_dac_model(**kwargs)
model = model.to(device)