diff --git a/data/tokenizer.json b/data/tokenizer.json index f034fc9..71f0582 100644 --- a/data/tokenizer.json +++ b/data/tokenizer.json @@ -41,9 +41,7 @@ } ], "normalizer": null, - "pre_tokenizer": { - "type": "Whitespace" - }, + "pre_tokenizer": null, "post_processor": { "type": "TemplateProcessing", "single": [ @@ -110,263 +108,264 @@ "end_of_word_suffix": null, "fuse_unk": false, "byte_fallback": false, + "ignore_merges": false, "vocab": { "": 0, "": 1, "": 2, "": 3, - "!": 4, - "\"": 5, - "(": 6, - ")": 7, - ",": 8, - "-": 9, - ".": 10, - "1": 11, - ":": 12, - "?": 13, - "a": 14, - "b": 15, - "c": 16, - "d": 17, - "e": 18, - "f": 19, - "h": 20, - "i": 21, - "j": 22, - "k": 23, - "l": 24, - "m": 25, - "n": 26, - "o": 27, - "p": 28, - "q": 29, - "r": 30, - "s": 31, - "t": 32, - "u": 33, - "v": 34, - "w": 35, - "x": 36, - "z": 37, - "¡": 38, - "«": 39, - "»": 40, - "¿": 41, - "æ": 42, - "ç": 43, - "ð": 44, - "ŋ": 45, - "ɐ": 46, - "ɑ": 47, - "ɔ": 48, - "ɕ": 49, - "ə": 50, - "ɚ": 51, - "ɛ": 52, - "ɜ": 53, - "ɟ": 54, - "ɡ": 55, - "ɪ": 56, - "ɬ": 57, - "ɯ": 58, - "ɹ": 59, - "ɾ": 60, - "ʃ": 61, - "ʈ": 62, - "ʊ": 63, - "ʋ": 64, - "ʌ": 65, - "ʑ": 66, - "ʒ": 67, - "ʔ": 68, - "ʲ": 69, - "ˈ": 70, - "ˌ": 71, - "ː": 72, - "̃": 73, - "̩": 74, - "θ": 75, - "ᵻ": 76, - "—": 77, - "…": 78, - "ˈɛ": 79, - "iː": 80, - "aɪ": 81, - "nd": 82, - "ˈɪ": 83, - "eɪ": 84, - "ˈæ": 85, - "oʊ": 86, - "ðə": 87, - "ɑː": 88, - "ˈeɪ": 89, - "ən": 90, - "uː": 91, - "ˈʌ": 92, - "ˈaɪ": 93, - "st": 94, - "ˈɔ": 95, - "ˈoʊ": 96, - "ˈiː": 97, - "ˈɑː": 98, - "ænd": 99, - "ːɹ": 100, - "ɪŋ": 101, - "ɜː": 102, - "ɪn": 103, - "tə": 104, - "ʌv": 105, - "aʊ": 106, - "əl": 107, - "ˈuː": 108, - "tʃ": 109, - "ɪz": 110, - "ˈɜː": 111, - "ˌʌ": 112, - "æt": 113, - "dʒ": 114, - "ˈɔː": 115, - "ɪt": 116, - "ˈaʊ": 117, - "ɚɹ": 118, - "ˈɛn": 119, - "wʌ": 120, - "li": 121, - "hiː": 122, - "ˌɛ": 123, - "wɪ": 124, - "ðæt": 125, - "wʌz": 126, - "juː": 127, - "oːɹ": 128, - "ðɪ": 129, - "sˈɛ": 130, - "ˈɑːɹ": 131, - "ˌɪ": 132, - "nt": 133, - "ˈʊ": 134, - "ənt": 135, - "hɪz": 136, - "hæ": 137, - "ˌɑː": 138, - "ɔːɹ": 139, - "ˈɛɹ": 140, - "wɪð": 141, - "ᵻd": 142, - "ˈoːɹ": 143, - "pɹ": 144, - "ˈɔːl": 145, - "mˌ": 146, - "ʃən": 147, - "kt": 148, - "ˌoʊ": 149, - "ˈɔːɹ": 150, - "fɹ": 151, - "æz": 152, - "ʃiː": 153, - "ˌʌt": 154, - "ˈɛl": 155, - "ˌaʊ": 156, - "ˈʌn": 157, - "əs": 158, - "hɜː": 159, - "lˈaɪ": 160, - "ˈæn": 161, - "ˈɪɹ": 162, - "ʊd": 163, - "ɹᵻ": 164, - "ld": 165, - "bˌʌt": 166, - "ks": 167, - "nˈoʊ": 168, - "ɾɚ": 169, - "hæd": 170, - "ɛɹ": 171, - "ˈɪŋ": 172, - "ɡɹ": 173, - "ɔn": 174, - "nˌɑː": 175, - "maɪ": 176, - "vɚ": 177, - "fɔːɹ": 178, - "ðɚ": 179, - "tʊ": 180, - "ðɛɹ": 181, - "ˈʌm": 182, - "nˌɑːt": 183, - "tɹ": 184, - "sˈiː": 185, - "ʌvðə": 186, - "mˈɪ": 187, - "ˈæp": 188, - "ˌɪm": 189, - "ɪk": 190, - "sp": 191, - "lˈeɪ": 192, - "hˌɪm": 193, - "ɐn": 194, - "ðeɪ": 195, - "lˈɪ": 196, - "ɾi": 197, - "bɹ": 198, - "lˈɛ": 199, - "kɹ": 200, - "ˈɪl": 201, - "jˈuː": 202, - "ʌm": 203, - "mˌiː": 204, + " ": 4, + "!": 5, + "\"": 6, + "(": 7, + ")": 8, + ",": 9, + "-": 10, + ".": 11, + "1": 12, + ":": 13, + ";": 14, + "?": 15, + "a": 16, + "b": 17, + "c": 18, + "d": 19, + "e": 20, + "f": 21, + "h": 22, + "i": 23, + "j": 24, + "k": 25, + "l": 26, + "m": 27, + "n": 28, + "o": 29, + "p": 30, + "q": 31, + "r": 32, + "s": 33, + "t": 34, + "u": 35, + "v": 36, + "w": 37, + "x": 38, + "z": 39, + "¡": 40, + "«": 41, + "»": 42, + "¿": 43, + "æ": 44, + "ç": 45, + "ð": 46, + "ŋ": 47, + "ɐ": 48, + "ɑ": 49, + "ɔ": 50, + "ɕ": 51, + "ə": 52, + "ɚ": 53, + "ɛ": 54, + "ɜ": 55, + "ɟ": 56, + "ɡ": 57, + "ɪ": 58, + "ɬ": 59, + "ɯ": 60, + "ɹ": 61, + "ɾ": 62, + "ʃ": 63, + "ʈ": 64, + "ʊ": 65, + "ʋ": 66, + "ʌ": 67, + "ʑ": 68, + "ʒ": 69, + "ʔ": 70, + "ʲ": 71, + "ˈ": 72, + "ˌ": 73, + "ː": 74, + "̃": 75, + "̩": 76, + "θ": 77, + "ᵻ": 78, + "—": 79, + "“": 80, + "”": 81, + "…": 82, + "ˈɛ": 83, + "iː": 84, + "aɪ": 85, + "nd": 86, + "ˈɪ": 87, + "eɪ": 88, + "ˈæ": 89, + "ðə": 90, + "oʊ": 91, + "ɑː": 92, + "ˈeɪ": 93, + "ən": 94, + "uː": 95, + "ˈʌ": 96, + "ˈaɪ": 97, + "st": 98, + "ˈɔ": 99, + "ˈoʊ": 100, + "ˈiː": 101, + "ˈɑː": 102, + "ænd": 103, + "ːɹ": 104, + "ɪŋ": 105, + "ɜː": 106, + "ɪn": 107, + "tə": 108, + "ʌv": 109, + "aʊ": 110, + "əl": 111, + "ˈuː": 112, + "tʃ": 113, + "ɪz": 114, + "ˈɜː": 115, + "ˌʌ": 116, + "æt": 117, + "dʒ": 118, + "ˈɔː": 119, + "ɪt": 120, + "ˈaʊ": 121, + "ɚɹ": 122, + "ˈɛn": 123, + "wʌ": 124, + "li": 125, + "hiː": 126, + "ˌɛ": 127, + "wɪ": 128, + "wʌz": 129, + "ðæt": 130, + "juː": 131, + "oːɹ": 132, + "ðɪ": 133, + "sˈɛ": 134, + "ˌɪ": 135, + "ˈɑːɹ": 136, + "nt": 137, + "ˈʊ": 138, + "ənt": 139, + "hɪz": 140, + "ˌɑː": 141, + "hæ": 142, + "ɔːɹ": 143, + "ˈɛɹ": 144, + "wɪð": 145, + "ᵻd": 146, + "ˈoːɹ": 147, + "pɹ": 148, + "ˈɔːl": 149, + "mˌ": 150, + "ʃən": 151, + "kt": 152, + "ˌoʊ": 153, + "ˈɔːɹ": 154, + "fɹ": 155, + "æz": 156, + "ˌʌt": 157, + "ʃiː": 158, + "ˈɛl": 159, + "ˌaʊ": 160, + "ˈʌn": 161, + "əs": 162, + "hɜː": 163, + "lˈaɪ": 164, + "ˈæn": 165, + "ˈɪɹ": 166, + "ʊd": 167, + "ɹᵻ": 168, + "ld": 169, + "bˌʌt": 170, + "ks": 171, + "nˈoʊ": 172, + "hæd": 173, + "ɾɚ": 174, + "ɛɹ": 175, + "ˈɪŋ": 176, + "ɡɹ": 177, + "nˌɑː": 178, + "ɔn": 179, + "vɚ": 180, + "maɪ": 181, + "fɔːɹ": 182, + "ðɚ": 183, + "tʊ": 184, + "ðɛɹ": 185, + "nˌɑːt": 186, + "ˈʌm": 187, + "tɹ": 188, + "sˈiː": 189, + "ʌvðə": 190, + "mˈɪ": 191, + "hˈæ": 192, + "ˌɪm": 193, + "lˈeɪ": 194, + "ɪk": 195, + "sp": 196, + "hˌɪm": 197, + "ɐn": 198, + "ðeɪ": 199, + "lˈɪ": 200, + "ɾi": 201, + "lˈɛ": 202, + "bɹ": 203, + "kɹ": 204, "lˈæ": 205, - "ˌɪn": 206, - "bᵻ": 207, - "wˈʌn": 208, - "ˈɪn": 209, - "ˈoʊn": 210, - "biː": 211, - "sˈɛd": 212, - "ˈɛd": 213, - "ˈaɪt": 214, - "fɹʌm": 215, - "baɪ": 216, - "ɪs": 217, - "ɚz": 218, - "ðɪs": 219, - "əns": 220, - "ɪf": 221, - "bəl": 222, - "ˈænd": 223, - "ɪnðə": 224, - "əm": 225, - "iːz": 226, - "ˌuː": 227, - "ᵻz": 228, - "wˈeɪ": 229, - "ft": 230, - "wiː": 231, - "lˈiː": 232, - "stɹ": 233, - "jʊ": 234, - "ɚd": 235, - "ˌaɪ": 236, - "kw": 237, - "ˌɔn": 238, - "ˈaɪd": 239, - "ts": 240, - "ɪm": 241, - "ˈʌst": 242, - "ˈoʊld": 243, - "ˌɪtʃ": 244, - "dˈɪ": 245, - "sˌoʊ": 246, - "ɑːɹ": 247, - "hɐ": 248, - "sˈeɪ": 249, - "ɾᵻd": 250, - "dᵻ": 251, - "wˌɪtʃ": 252, - "sˈɛl": 253, - "ɹi": 254, - "ˈʌðɚ": 255 + "ˈɪl": 206, + "jˈuː": 207, + "ʌm": 208, + "mˌiː": 209, + "bᵻ": 210, + "wˈʌn": 211, + "ˌɪn": 212, + "ˈɪn": 213, + "ˈoʊn": 214, + "sˈɛd": 215, + "biː": 216, + "ˈɛd": 217, + "ˈaɪt": 218, + "baɪ": 219, + "fɹʌm": 220, + "ɪs": 221, + "ɚz": 222, + "ðɪs": 223, + "əns": 224, + "bəl": 225, + "ɪf": 226, + "ɪnðə": 227, + "əm": 228, + "ᵻz": 229, + "ˌuː": 230, + "wˈeɪ": 231, + "ft": 232, + "wiː": 233, + "stɹ": 234, + "lˈiː": 235, + "iːz": 236, + "pt": 237, + "jʊ": 238, + "ɚd": 239, + "ˌaɪ": 240, + "kw": 241, + "ˌɔn": 242, + "ˈaɪd": 243, + "ɪm": 244, + "ˈʌst": 245, + "ˈoʊld": 246, + "ts": 247, + "ˌɪtʃ": 248, + "sˌoʊ": 249, + "dˈɪ": 250, + "ɑːɹ": 251, + "hɐ": 252, + "sˈeɪ": 253, + "ɾᵻd": 254, + "wˌɪtʃ": 255 }, "merges": [ "ˈ ɛ", @@ -376,8 +375,8 @@ "ˈ ɪ", "e ɪ", "ˈ æ", - "o ʊ", "ð ə", + "o ʊ", "ɑ ː", "ˈ eɪ", "ə n", @@ -415,20 +414,20 @@ "h iː", "ˌ ɛ", "w ɪ", - "ð æt", "wʌ z", + "ð æt", "j uː", "o ːɹ", "ð ɪ", "s ˈɛ", - "ˈɑː ɹ", "ˌ ɪ", + "ˈɑː ɹ", "n t", "ˈ ʊ", "ən t", "h ɪz", - "h æ", "ˌ ɑː", + "h æ", "ɔ ːɹ", "ˈɛ ɹ", "wɪ ð", @@ -443,8 +442,8 @@ "ˈɔ ːɹ", "f ɹ", "æ z", - "ʃ iː", "ˌʌ t", + "ʃ iː", "ˈɛ l", "ˌ aʊ", "ˈʌ n", @@ -459,93 +458,89 @@ "b ˌʌt", "k s", "n ˈoʊ", - "ɾ ɚ", "hæ d", + "ɾ ɚ", "ɛ ɹ", "ˈɪ ŋ", "ɡ ɹ", - "ɔ n", "n ˌɑː", - "m aɪ", + "ɔ n", "v ɚ", + "m aɪ", "f ɔːɹ", "ð ɚ", "t ʊ", "ð ɛɹ", - "ˈʌ m", "nˌɑː t", + "ˈʌ m", "t ɹ", "s ˈiː", "ʌv ðə", "m ˈɪ", - "ˈæ p", + "h ˈæ", "ˌɪ m", + "l ˈeɪ", "ɪ k", "s p", - "l ˈeɪ", "h ˌɪm", "ɐ n", "ð eɪ", "l ˈɪ", "ɾ i", - "b ɹ", "l ˈɛ", + "b ɹ", "k ɹ", + "l ˈæ", "ˈɪ l", "j ˈuː", "ʌ m", "mˌ iː", - "l ˈæ", - "ˌ ɪn", "b ᵻ", "w ˈʌn", + "ˌ ɪn", "ˈɪ n", "ˈoʊ n", - "b iː", "sˈɛ d", + "b iː", "ˈɛ d", "ˈaɪ t", - "fɹ ʌm", "b aɪ", + "fɹ ʌm", "ɪ s", "ɚ z", "ðɪ s", "ən s", - "ɪ f", "b əl", - "ˈæ nd", + "ɪ f", "ɪn ðə", "ə m", - "iː z", - "ˌ uː", "ᵻ z", + "ˌ uː", "w ˈeɪ", "f t", "w iː", - "l ˈiː", "st ɹ", + "l ˈiː", + "iː z", + "p t", "j ʊ", "ɚ d", "ˌ aɪ", "k w", "ˌ ɔn", "ˈaɪ d", - "t s", "ɪ m", "ˈʌ st", "ˈoʊ ld", + "t s", "ˌɪ tʃ", - "d ˈɪ", "s ˌoʊ", + "d ˈɪ", "ɑː ɹ", "h ɐ", "s ˈeɪ", "ɾ ᵻd", - "d ᵻ", - "w ˌɪtʃ", - "sˈɛ l", - "ɹ i", - "ˈʌ ðɚ" + "w ˌɪtʃ" ] } } \ No newline at end of file diff --git a/scripts/process_dataset.py b/scripts/process_dataset.py index 1290160..9b43f30 100644 --- a/scripts/process_dataset.py +++ b/scripts/process_dataset.py @@ -6,16 +6,22 @@ import torchaudio from tqdm.auto import tqdm from pathlib import Path from vall_e.config import cfg + +# things that could be args +cfg.sample_rate = 44_000 +cfg.inference.audio_backend = "dac" +""" +cfg.inference.weight_dtype = "bfloat16" +cfg.inference.dtype = torch.bfloat16 +cfg.inference.amp = True +""" + from vall_e.emb.g2p import encode as valle_phonemize from vall_e.emb.qnt import encode as valle_quantize, _replace_file_extension -# things that could be args -cfg.sample_rate = 24_000 -cfg.inference.audio_backend = "encodec" - input_audio = "voices" -input_metadata = "./training/metadata" -output_dataset = f"./training/data-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}" +input_metadata = "metadata" +output_dataset = f"training-{'2' if cfg.sample_rate == 24_000 else '4'}4KHz-{cfg.inference.audio_backend}" device = "cuda" audio_extension = ".dac" if cfg.inference.audio_backend == "dac" else ".enc" @@ -34,9 +40,6 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')): if not os.path.isdir(f'./{input_audio}/{dataset_name}/'): print("Is not dir:", f'./{input_audio}/{dataset_name}/') continue - - if dataset_name in ["LibriVox", "Audiobooks"]: - continue for speaker_id in tqdm(sorted(os.listdir(f'./{input_audio}/{dataset_name}/')), desc=f"Processing speaker in {dataset_name}"): if not os.path.isdir(f'./{input_audio}/{dataset_name}/{speaker_id}'): @@ -55,10 +58,29 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')): waveform, sample_rate = torchaudio.load(inpath) qnt = valle_quantize(waveform, sr=sample_rate, device=device) + if cfg.inference.audio_backend == "dac": - qnt.save(_replace_file_extension(outpath, audio_extension)) + np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), { + "codes": qnt.codes.numpy().astype(np.uint16), + "metadata": { + "original_length": qnt.original_length, + "sample_rate": qnt.sample_rate, + + "input_db": qnt.input_db.numpy().astype(np.float32), + "chunk_length": qnt.chunk_length, + "channels": qnt.channels, + "padding": qnt.padding, + "dac_version": "1.0.0", + }, + }) else: - torch.save( qnt, _replace_file_extension(outpath, audio_extension) ) + np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), { + "codes": qnt.numpy().astype(np.uint16), + "metadata": { + "original_length": waveform.shape[-1], + "sample_rate": sample_rate, + }, + }) continue @@ -91,7 +113,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')): fname = filename.replace(f'.{extension}', "") waveform, sample_rate = None, None - language = metadata[filename]["language"] if "language" in metadata[filename] else "english" + language = metadata[filename]["language"] if "language" in metadata[filename] else "en" if len(metadata[filename]["segments"]) == 0 or not use_slices: outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}.{extension}') @@ -100,86 +122,101 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')): if len(text) == 0: continue - if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists(): + if _replace_file_extension(outpath, audio_extension).exists(): continue - if not _replace_file_extension(outpath, ".json").exists(): - txts.append(( - outpath, - text, - language, - )) - - if not _replace_file_extension(outpath, audio_extension).exists(): - if waveform is None: - waveform, sample_rate = torchaudio.load(inpath) + if waveform is None: + waveform, sample_rate = torchaudio.load(inpath) + if waveform.shape[0] > 1: + waveform = torch.mean(waveform, dim=0, keepdim=True) - wavs.append(( - outpath, - waveform, - sample_rate - )) + wavs.append(( + outpath, + text, + language, + waveform, + sample_rate + )) else: i = 0 for segment in metadata[filename]["segments"]: id = pad(i, 4) i = i + 1 - outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}') - if _replace_file_extension(outpath, ".json").exists() and _replace_file_extension(outpath, audio_extension).exists(): + outpath = Path(f'./{output_dataset}/{dataset_name}/{speaker_id}/{fname}_{id}.{extension}') + text = metadata[filename]["text"] + + if len(text) == 0: continue - if not _replace_file_extension(outpath, ".json").exists(): - txts.append(( - outpath, - segment["text"], - language, - )) - - if not _replace_file_extension(outpath, audio_extension).exists(): - if waveform is None: - waveform, sample_rate = torchaudio.load(inpath) + if _replace_file_extension(outpath, audio_extension).exists(): + continue - start = int(segment['start'] * sample_rate) - end = int(segment['end'] * sample_rate) + if waveform is None: + waveform, sample_rate = torchaudio.load(inpath) + if waveform.shape[0] > 1: + waveform = torch.mean(waveform, dim=0, keepdim=True) - if start < 0: - start = 0 - if end >= waveform.shape[-1]: - end = waveform.shape[-1] - 1 + start = int(segment['start'] * sample_rate) + end = int(segment['end'] * sample_rate) - if end - start < 0: - continue + if start < 0: + start = 0 + if end >= waveform.shape[-1]: + end = waveform.shape[-1] - 1 - wavs.append(( - outpath, - waveform[:, start:end], - sample_rate - )) + if end - start < 0: + continue - if len(txts) > 0: - for job in tqdm(txts, desc=f"Phonemizing: {speaker_id}", disable=True): - outpath, text, language = job - phones = valle_phonemize(text) - data = { - "text": text.strip(), - "phonemes": phones, - "language": language, - } - open(_replace_file_extension(outpath, ".json"), 'w', encoding='utf-8').write(json.dumps(data)) + wavs.append(( + outpath, + text, + language, + waveform[:, start:end], + sample_rate + )) if len(wavs) > 0: for job in tqdm(wavs, desc=f"Quantizing: {speaker_id}"): try: - outpath, waveform, sample_rate = job + outpath, text, language, waveform, sample_rate = job + + phones = valle_phonemize(text) qnt = valle_quantize(waveform, sr=sample_rate, device=device) + if cfg.inference.audio_backend == "dac": - qnt.save(_replace_file_extension(outpath, audio_extension)) + np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), { + "codes": qnt.codes.numpy().astype(np.uint16), + "metadata": { + "original_length": qnt.original_length, + "sample_rate": qnt.sample_rate, + + "input_db": qnt.input_db.numpy().astype(np.float32), + "chunk_length": qnt.chunk_length, + "channels": qnt.channels, + "padding": qnt.padding, + "dac_version": "1.0.0", + + "text": text.strip(), + "phonemes": "".join(phones), + "language": language, + }, + }) else: - torch.save( qnt, _replace_file_extension(outpath, audio_extension) ) + np.save(open(_replace_file_extension(outpath, audio_extension), "wb"), { + "codes": qnt.numpy().astype(np.uint16), + "metadata": { + "original_length": waveform.shape[-1], + "sample_rate": sample_rate, + + "text": text.strip(), + "phonemes": "".join(phones), + "language": language, + }, + }) except Exception as e: print(f"Failed to quantize: {outpath}:", e) continue open("./missing.json", 'w', encoding='utf-8').write(json.dumps(missing)) -open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset)) +open("./dataset_list.json", 'w', encoding='utf-8').write(json.dumps(dataset)) \ No newline at end of file diff --git a/scripts/train_tokenizer.py b/scripts/train_tokenizer.py index cc22430..57c0f95 100644 --- a/scripts/train_tokenizer.py +++ b/scripts/train_tokenizer.py @@ -38,20 +38,23 @@ else: metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/{id}') metadata = json.loads(open(metadata_path, "r", encoding="utf-8").read()) + if "phonemes" not in metadata: + continue + tokenizer_data.append( f'{"".join(metadata["phonemes"])}' ) open(output_file, 'w', encoding='utf-8').write(json.dumps(tokenizer_data)) unk_token = "" -spl_tokens = ["", "", unk_token, ""] +spl_tokens = [unk_token, "", "", "", ""] trainer = BpeTrainer(special_tokens = spl_tokens, vocab_size = 256) tokenizer = Tokenizer(BPE(unk_token = unk_token)) -tokenizer.pre_tokenizer = Whitespace() +tokenizer.pre_tokenizer = Whitespace() # takes 2 hours to process without this, we'll just manually add spaces as a token tokenizer.post_processor = TemplateProcessing( single=" $A ", special_tokens=[("", 1), ("", 2)], ) tokenizer.train_from_iterator(tokenizer_data, trainer=trainer) -tokenizer.save("./training/tokenizer.json") \ No newline at end of file +tokenizer.save("./training/tokenizer_training_data.json") \ No newline at end of file diff --git a/vall_e/data.py b/vall_e/data.py index adf8574..c242585 100755 --- a/vall_e/data.py +++ b/vall_e/data.py @@ -86,19 +86,15 @@ def _calculate_durations( type="training" ): def _load_paths(dataset, type="training"): return { cfg.get_spkr( cfg.data_dir / data_dir / "dummy" ): _load_paths_from_metadata( data_dir, type=type, validate=cfg.dataset.validate and type == "training" ) for data_dir in tqdm(dataset, desc=f"Parsing dataset: {type}") } -def _load_paths_from_metadata(dataset_name, type="training", validate=False): - data_dir = dataset_name if cfg.dataset.use_hdf5 else cfg.data_dir / dataset_name +def _load_paths_from_metadata(group_name, type="training", validate=False): + data_dir = group_name if cfg.dataset.use_hdf5 else cfg.data_dir / group_name _fn = _get_hdf5_paths if cfg.dataset.use_hdf5 else _get_paths_of_extensions - def key( id ): - if not cfg.dataset.use_hdf5: - return data_dir / id + def key( id, entry=None ): + return f"/{type}/{_get_hdf5_path(data_dir)}/{id}" if cfg.dataset.use_hdf5 else data_dir / id - return f"/{type}/{_get_hdf5_path(data_dir)}/{id}" - - - metadata_path = cfg.metadata_dir / f'{dataset_name}.json' + metadata_path = cfg.metadata_dir / f'{group_name}.json' metadata = {} if cfg.dataset.use_metadata and metadata_path.exists(): @@ -107,10 +103,7 @@ def _load_paths_from_metadata(dataset_name, type="training", validate=False): if len(metadata) == 0: return _fn( data_dir, type if cfg.dataset.use_hdf5 else _get_quant_extension(), validate ) - - def _validate( id ): - entry = metadata[id] - + def _validate( id, entry ): phones = entry['phones'] if "phones" in entry else 0 duration = entry['duration'] if "duration" in entry else 0 if type not in _total_durations: @@ -118,14 +111,16 @@ def _load_paths_from_metadata(dataset_name, type="training", validate=False): _total_durations[type] += duration + """ if cfg.dataset.use_hdf5: k = key( id ) if k not in cfg.hdf5 or "audio" not in cfg.hdf5[k] or "text" not in cfg.hdf5[k]: return False + """ - return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones + return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration #and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones - return [ key(id) for id in metadata.keys() if not validate or _validate(id) ] + return [ key(id, entry) for id, entry in metadata.items() if not validate or _validate(id, entry) ] def _get_hdf5_path(path): @@ -136,16 +131,16 @@ def _get_hdf5_path(path): def _get_hdf5_paths( data_dir, type="training", validate=False ): data_dir = str(data_dir) - def _validate( child ): - phones = child.attrs['phonemes'] - duration = child.attrs['duration'] + def _validate( id, entry ): + phones = entry.attrs['phonemes'] + duration = entry.attrs['duration'] if type not in _total_durations: _total_durations[type] = 0 - _total_durations[type] += child.attrs['duration'] - return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones + _total_durations[type] += entry.attrs['duration'] + return cfg.dataset.min_duration <= duration and duration <= cfg.dataset.max_duration #and cfg.dataset.min_phones <= phones and phones <= cfg.dataset.max_phones key = f"/{type}/{_get_hdf5_path(data_dir)}" - return [ Path(f"{key}/{child.attrs['id']}") for child in cfg.hdf5[key].values() if not validate or _validate(child) ] if key in cfg.hdf5 else [] + return [ Path(f"{key}/{id}") for id, entry in cfg.hdf5[key].items() if not validate or _validate(id, entry) ] if key in cfg.hdf5 else [] def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ): if isinstance(path, str): @@ -807,47 +802,30 @@ def create_dataset_metadata( skip_existing=True ): if id not in metadata: metadata[id] = {} - # audio + utterance_metadata = {} if audios: - if _get_quant_extension() == ".dac": - dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()] - qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16) + # ideally we'll encode Encodec-based audio in a similar manner because np has smaller files than pt + dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()] + qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16) - duration = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"] - metadata[id]["metadata"] = { - "original_length": dac["metadata"]["original_length"], - "sample_rate": dac["metadata"]["sample_rate"], - } - else: - qnt = torch.load(f'{root}/{name}/{id}{_get_quant_extension()}')[0].t() - duration = qnt.shape[0] / cfg.dataset.frames_per_second - - metadata[id]["duration"] = duration - else: - metadata[id]["duration"] = 0 - + if "text" in dac["metadata"]: + utterance_metadata["text"] = dac["metadata"]["text"] + if "phonemes" in dac["metadata"]: + utterance_metadata["phonemes"] = dac["metadata"]["phonemes"] + if "language" in dac["metadata"]: + utterance_metadata["language"] = dac["metadata"]["language"] + if "original_length" in dac["metadata"] and "sample_rate" in dac["metadata"]: + utterance_metadata["duration"] = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"] # text if texts: - if _get_phone_extension() == ".json": - json_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read()) - content = json_metadata["phonemes"] - txt = json_metadata["text"] - lang = json_metadata["language"][:2] - else: - content = open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read().split(" ") - txt = "" - lang = "en" + if not utterance_metadata: + utterance_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read()) - phn = cfg.tokenizer.encode("".join(content)) - phn = np.array(phn).astype(np.uint8) + for k, v in utterance_metadata.items(): + metadata[id][k] = v - metadata[id]["phones"] = len(phn) - metadata[id]["transcription"] = txt - metadata[id]["language"] = lang except Exception as e: - #raise e - print(id, e) - #pass + tqdm.write(f'Error while processing {id}: {e}') with open(str(metadata_path), "w", encoding="utf-8") as f: f.write( json.dumps( metadata ) ) @@ -900,84 +878,68 @@ def create_dataset_hdf5( skip_existing=True ): for id in tqdm(ids, desc=f"Processing {name}"): try: - audio_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}') if audios else True - text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if texts else True + audio_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}') + text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if type != "Noise" else True - if not audio_exists or not text_exists: + if not audio_exists: continue key = f'{type}/{speaker_name}/{id}' + """ if skip_existing and key in hf: continue + """ group = hf.create_group(key) if key not in hf else hf[key] + """ group.attrs['id'] = id group.attrs['type'] = type group.attrs['speaker'] = speaker_name + """ if id not in metadata: metadata[id] = {} + utterance_metadata = {} + # audio if audios: - if _get_quant_extension() == ".dac": - dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()] - qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16) + # ideally we'll encode Encodec-based audio in a similar manner because np has smaller files than pt + dac = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()] + qnt = torch.from_numpy(dac["codes"].astype(int))[0].t().to(dtype=torch.int16) - duration = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"] - metadata[id]["metadata"] = { - "original_length": dac["metadata"]["original_length"], - "sample_rate": dac["metadata"]["sample_rate"], - } - else: - qnt = torch.load(f'{root}/{name}/{id}{_get_quant_extension()}')[0].t() - duration = qnt.shape[0] / cfg.dataset.frames_per_second - - qnt = qnt.numpy().astype(np.int16) + if "text" in dac["metadata"]: + utterance_metadata["text"] = dac["metadata"]["text"] + if "phonemes" in dac["metadata"]: + utterance_metadata["phonemes"] = dac["metadata"]["phonemes"] + if "language" in dac["metadata"]: + utterance_metadata["language"] = dac["metadata"]["language"] + if "original_length" in dac["metadata"] and "sample_rate" in dac["metadata"]: + utterance_metadata["duration"] = dac["metadata"]["original_length"] / dac["metadata"]["sample_rate"] if "audio" not in group: - group.create_dataset('audio', data=qnt, compression='lzf') + group.create_dataset('audio', data=qnt.numpy().astype(np.int16), compression='lzf') - group.attrs['duration'] = duration - metadata[id]["duration"] = duration - else: - group.attrs['duration'] = 0 - metadata[id]["duration"] = 0 - # text if texts: - if _get_phone_extension() == ".json": - json_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read()) - content = json_metadata["phonemes"] - txt = json_metadata["text"] - lang = json_metadata["language"][:2] - else: - content = open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read().split(" ") - txt = "" - lang = "en" + if not utterance_metadata and text_exists: + utterance_metadata = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read()) - phn = cfg.tokenizer.encode("".join(content)) + phn = "".join(utterance_metadata["phonemes"]) + phn = cfg.tokenizer.encode(phn) phn = np.array(phn).astype(np.uint8) if "text" not in group: group.create_dataset('text', data=phn, compression='lzf') - group.attrs['phonemes'] = len(phn) - group.attrs['transcription'] = txt - group.attrs['language'] = lang + for k, v in utterance_metadata.items(): + group.attrs[k] = v + metadata[id][k] = v - metadata[id]["phones"] = len(phn) - metadata[id]["transcription"] = txt - metadata[id]["language"] = lang - else: - group.attrs['phonemes'] = 0 - metadata[id]["phones"] = 0 except Exception as e: - #raise e - print(id, e) - #pass + tqdm.write(f'Error while processing {id}: {e}') with open(str(metadata_path), "w", encoding="utf-8") as f: f.write( json.dumps( metadata ) ) @@ -1002,119 +964,6 @@ def create_dataset_hdf5( skip_existing=True ): hf.create_dataset('symmap', data=json.dumps(symmap)) hf.close() -def extract_dataset_hdf5( skip_existing=True ): - cfg.dataset.use_hdf5 = True - cfg.load_hdf5(write=False) - hf = cfg.hdf5 - - symmap = get_phone_symmap() - - reverse_symmap = {"1":"","2":"","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"dˌ","12":"mˌ","13":"d","14":"ɹ","15":"tˈ","16":"pˌ","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"wˌ","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"hˌ","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"bˌ","75":"kˈ","76":"ɔ","77":"zˈ","78":"ᵻ","79":"kˌ","80":"vˈ","81":"fˌ","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"tˌ","86":"pˈ","87":"ðˌ","88":"sˌ","89":"nˌ","90":"lˌ","91":"̩","92":"ʔ","93":"vˌ","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"jˌ","101":"uːˈ","102":"iːˈ","103":"zˌ","104":".ˈ","105":"…","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"iˌ","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"-ˌ","127":"ɫ","128":"q","129":"—","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":".ˌ","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"?ˌ","150":",ˌ","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"!ˌ","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"qˌ","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"”","180":"“","181":"“ˈ","182":"“ˌ","183":";ˈ","184":";ˌ","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"ᵝ","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"oˌ","221":"eˈ","222":"ʍ","223":"eˌ","224":"uˌ","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"} - - root = str(cfg.data_dir) - - def add( type="training", audios=True, texts=True ): - for group in tqdm( hf[f'{type}/data/'].keys(), desc=f"Processing {type}"): - for name in tqdm( hf[f'{type}/data/{group}'].keys(), desc=f"Processing {group}"): - (cfg.data_dir / group / name).mkdir(parents=True, exist_ok=True) - - for id in tqdm( hf[f'{type}/data/{group}/{name}'].keys(), desc=f"Processing {name}"): - try: - key = f'{type}/data/{group}/{name}/{id}' - - if key not in hf: - tqdm.write(f'Missing key: {key}') - continue - - audio_exists = "audio" in hf[key] - text_exists = "text" in hf[key] - - if not audio_exists or not text_exists: - tqdm.write(f'Missing audio/text: {key}') - continue - - audio_path = Path(f'{root}/{group}/{name}/{id}.enc') - text_path = Path(f'{root}/{group}/{name}/{id}.json') - - # audio - if audios and audio_exists and not audio_path.exists(): - qnt = hf[key]["audio"][:, :] - torch.save( qnt, audio_path ) - - # text - if texts and text_exists and not text_path.exists(): - tokens = hf[key]["text"][:][1:-1] - phones = [ reverse_symmap[f'{token}'] for token in tokens ] - phones = list("".join(phones).replace(" ", " ")) - - j = { - "text": "", - "phonemes": phones, - "language": "en" - } - - with open(text_path, "w", encoding="utf-8") as f: - f.write( json.dumps( j ) ) - - except Exception as e: - raise e - - add( type="training" ) - add( type="validation" ) - add( type="noise", texts=False ) - - hf.close() - -def retokenize_dataset_hdf5( skip_existing=True ): - cfg.dataset.use_hdf5 = True - cfg.load_hdf5(write=True) - hf = cfg.hdf5 - - symmap = get_phone_symmap() - reverse_symmap = {"1":"","2":"","3":" ","4":".","5":",","6":"!","7":"p","8":"iː","9":"ɚ","10":"ˌ","11":"dˌ","12":"mˌ","13":"d","14":"ɹ","15":"tˈ","16":"pˌ","17":"uː","18":"l","19":"æ","20":"ɛ","21":"ɪ","22":"j","23":"ʊ","24":"t","25":"n","26":"v","27":"a","28":"o","29":"ŋ","30":"w","31":"ʌ","32":"hˈ","33":"ɡˈ","34":"ə","35":"θˈ","36":"dˈ","37":"wˌ","38":"h","39":"z","40":"k","41":"ð","42":"ɡˌ","43":"ˈ","44":"fˈ","45":"i","46":"s","47":"ʃ","48":"wˈ","49":"ðˈ","50":"ɹˈ","51":"lˈ","52":"ɡ","53":"oː","54":"mˈ","55":"e","56":"ɑː","57":"nˈ","58":"m","59":"θˌ","60":"sˈ","61":"f","62":"ɔː","63":"hˌ","64":"b","65":"jˈ","66":"ɐ","67":"ʒˈ","68":"θ","69":"bˈ","70":"ɾ","71":"ɜː","72":"ʌˈ","73":"ʃˌ","74":"bˌ","75":"kˈ","76":"ɔ","77":"zˈ","78":"ᵻ","79":"kˌ","80":"vˈ","81":"fˌ","82":"ʒ","83":"ʃˈ","84":"ɹˌ","85":"tˌ","86":"pˈ","87":"ðˌ","88":"sˌ","89":"nˌ","90":"lˌ","91":"̩","92":"ʔ","93":"vˌ","94":"ɪˈ","95":"\"","96":"ɪˌ","97":"ʒˌ","98":"uːˌ","99":"ʊˈ","100":"jˌ","101":"uːˈ","102":"iːˈ","103":"zˌ","104":".ˈ","105":"…","106":"ŋˌ","107":"ɐˌ","108":"—ˈ","109":"iˌ","110":"iːˌ","111":"ɛː","112":")","113":")ˈ","114":"(","115":"u","116":"-","117":"ɖˈ","118":"iˈ","119":"ʰˈ","120":"ɟˈ","121":"̃","122":"eː","123":"ɾˈ","124":"r","125":"ʰ","126":"-ˌ","127":"ɫ","128":"q","129":"—","130":"ʊˌ","131":"aː","132":"cˈ","133":"…ˈ","134":"c","135":"ɳ","136":"ɐˈ","137":"x","138":"ʔˌ","139":".ˌ","140":"ɑ","141":"?ˈ","142":"̩ˈ","143":"\"ˈ","144":",ˈ","145":"ŋˈ","146":"əˌ","147":"!ˈ","148":"\"ˌ","149":"?ˌ","150":",ˌ","151":"—ˌ","152":"̩ˌ","153":"əˈ","154":"!ˌ","155":"ɬ","156":"ʲ","157":"¡","158":"ɯ","159":"qˌ","160":"ʑ","161":"ʑˈ","162":"¿","163":"ɑːˈ","164":"iːː","165":"ɛˈ","166":"¡ˈ","167":"æˈ","168":"ç","169":"ɾˌ","170":"ᵻˈ","171":"xˈ","172":"ɔːˈ","173":";","174":"ɬˌ","175":":","176":"ʔˈ","177":"ɑːˌ","178":"ɬˈ","179":"”","180":"“","181":"“ˈ","182":"“ˌ","183":";ˈ","184":";ˌ","185":":ˈ","186":"1","187":"rˈ","188":"qˈ","189":"ᵻˌ","190":"ä","191":"̞ˌ","192":"̞","193":"ũˌ","194":"ʑˌ","195":"ᵝ","196":"ɽ","197":"ʲˌ","198":"ᵝˌ","199":"ũ","200":"ũˈ","201":"äˌ","202":"ɕ","203":"ɕˌ","204":"ɽˌ","205":"çˌ","206":"…ˌ","207":"̞ˈ","208":"äˈ","209":"ɽˈ","210":"ɸˌ","211":"ɴ","212":"ɸˈ","213":"ɕˈ","214":"ɸ","215":"ᵝˈ","216":"ʲˈ","217":"ĩ","218":"çˈ","219":"ĩˌ","220":"oˌ","221":"eˈ","222":"ʍ","223":"eˌ","224":"uˌ","225":"ʍˌ","226":"uˈ","227":"oˈ","228":"aˈ"} - - root = str(cfg.data_dir) - - def add( type="training" ): - for group in tqdm( hf[f'{type}/data/'].keys(), desc=f"Processing {type}"): - for name in tqdm( hf[f'{type}/data/{group}'].keys(), desc=f"Processing {group}"): - (cfg.data_dir / group / name).mkdir(parents=True, exist_ok=True) - - for id in tqdm( hf[f'{type}/data/{group}/{name}'].keys(), desc=f"Processing {name}"): - try: - key = f'{type}/data/{group}/{name}/{id}' - - if key not in hf: - tqdm.write(f'Missing key: {key}') - continue - - if "text" not in hf[key]: - tqdm.write(f'Missing text: {key}') - continue - - # text - tokens = hf[key]["text"][:][1:-1] - content = list("".join([ reverse_symmap[f'{token}'] for token in tokens ]).replace(" ", " ")) - - tokens = cfg.tokenizer.encode("".join(content)) - tokens = np.array(tokens).astype(np.uint8) - - del hf[key]['text'] - hf[key].create_dataset('text', data=tokens, compression='lzf') - - except Exception as e: - raise e - - add( type="training" ) - add( type="validation" ) - - # write symmap - if "symmap" in hf: - del hf['symmap'] - - hf.create_dataset('symmap', data=json.dumps(symmap)) - hf.close() - if __name__ == "__main__": import argparse @@ -1135,10 +984,6 @@ if __name__ == "__main__": if args.action == "hdf5": create_dataset_hdf5() - if args.action == "extract-hdf5": - extract_dataset_hdf5() - if args.action == "retokenize-hdf5": - retokenize_dataset_hdf5() elif args.action == "list-dataset": dataset = [] for group in os.listdir(cfg.data_dir): @@ -1147,7 +992,7 @@ if __name__ == "__main__": continue dataset.append(f'{group}/{name}') - print(dataset) + print(json.dumps(dataset)) elif args.action == "metadata": create_dataset_metadata() elif args.action == "sample":