vall-e/scripts/train_tokenizer.py

"""
# Helper script to grab all phonemes through parsed dataset metadata to find the "best" tokenizer dict
"""
import os
import json
import torch
import torchaudio
from tqdm.auto import tqdm
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from vall_e.config import cfg
from vall_e.utils.io import json_read
from vall_e.emb.g2p import coerce_to_hiragana
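# NOTE: `input_metadata` is only used by the commented-out legacy path further below;
# `output_file` caches the collected texts so repeat runs can skip the metadata scan.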
input_metadata = "training/metadata/"
output_file = Path("./training/tokenizer_pretraining_data.json")
tokenizer_data = []
def pad(num, zeroes):
	return str(num).zfill(zeroes+1)
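# Collect every `text` transcription from one dataset directory's metadata JSON
# into the module-level `tokenizer_data` list.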
def add( dir, type="training", audios=True, texts=True ):
	name = str(dir)
	name = name.replace(str(cfg.data_dir), "")
	speaker_name = name
	"""
	if "LibriTTS-R" in speaker_name:
		speaker_name = speaker_name.replace("LibriTTS-R", "LibriVox")
	"""

	metadata_path = cfg.metadata_dir / f'{speaker_name}.json'
	metadata = json_read( metadata_path, default={} )

	for k, entry in metadata.items():
		if "text" not in entry:
			continue
		language = entry.get('language','auto')
		text = entry['text']
		tokenizer_data.append( text )
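# Reuse the cached text list when it exists; otherwise walk every configured
# dataset directory and build it from scratch.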
if output_file.exists():
	tokenizer_data = json.loads(open(str(output_file), "r", encoding="utf-8").read())
else:
	# training
	for data_dir in tqdm(sorted(cfg.dataset.training), desc="Processing Training"):
		try:
			add( data_dir, type="training" )
		except Exception as e:
			pass

	# validation
	for data_dir in tqdm(sorted(cfg.dataset.validation), desc='Processing Validation'):
		try:
			add( data_dir, type="validation" )
		except Exception as e:
			pass

	# persist the collected texts so the exists() check above can short-circuit
	# future runs (mirrors the write at the end of the legacy path below)
	open(str(output_file), 'w', encoding='utf-8').write(json.dumps(tokenizer_data))
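# Legacy path, kept for reference: walks ./training/metadata/ on disk directly
# instead of going through cfg.dataset.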
"""
for dataset_name in os.listdir(f'./{input_metadata}/'):
if not os.path.isdir(f'./{input_metadata}/{dataset_name}/'):
continue
for speaker_id in tqdm(os.listdir(f'./{input_metadata}/{dataset_name}/'), desc="Processing speaker"):
if not os.path.isdir(f'./{input_metadata}/{dataset_name}/{speaker_id}'):
continue
for id in os.listdir(f'./{input_metadata}/{dataset_name}/{speaker_id}/'):
if ".json" not in id:
continue
metadata_path = Path(f'./{input_metadata}/{dataset_name}/{speaker_id}/{id}')
metadata = json.loads(open(metadata_path, "r", encoding="utf-8").read())
2025-01-05 18:47:03 +00:00
if "text" not in metadata:
2024-05-16 04:04:19 +00:00
continue
2025-01-05 18:47:03 +00:00
tokenizer_data.append( f'{"".join(metadata["text"])}' )
open(output_file, 'w', encoding='utf-8').write(json.dumps(tokenizer_data))
2025-01-05 18:47:03 +00:00
"""
unk_token = "<unk>"
spl_tokens = [unk_token, "<bos>", "</eos>", "<mask>", "<space>"]
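# max_token_length=1 disallows any BPE merges, so the learned vocab is effectively
# character/phoneme-level; vocab_size and min_frequency are then largely moot.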
trainer = BpeTrainer(special_tokens = spl_tokens, vocab_size = 32768, max_token_length=1, min_frequency=len(tokenizer_data))
tokenizer = Tokenizer(BPE(unk_token = unk_token))
tokenizer.pre_tokenizer = Whitespace() # without a pre-tokenizer, training takes ~2 hours; spaces are handled manually via the <space> token instead
tokenizer.post_processor = TemplateProcessing(
	single="<bos> $A </eos>", # "</eos>" matches spl_tokens above; the original "<eos>" is not in the special-token list
	special_tokens=[("<bos>", 1), ("</eos>", 2)],
)
tokenizer.train_from_iterator(tokenizer_data, trainer=trainer)
tokenizer.save("./training/tokenizer_training_data.json")
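
# Quick sanity check (an illustrative sketch, not part of the original flow): reload
# the saved tokenizer and encode a sample to confirm the <bos>/</eos> template applies.
if tokenizer_data:
	reloaded = Tokenizer.from_file("./training/tokenizer_training_data.json")
	encoded = reloaded.encode(tokenizer_data[0])
	print(encoded.tokens[:16], encoded.ids[:16])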