limit eval size because the training batch size seems to be used for the eval dataloader, somehow (bandaid)

This commit is contained in:
mrq 2024-06-29 09:11:28 -05:00
parent 591d3ac848
commit dd40463803
6 changed files with 31 additions and 14 deletions

View File

@ -23,8 +23,7 @@ I've tested this repo under Python versions `3.10.9`, `3.11.3`, and `3.12.3`.
## Pre-Trained Model
> [!NOTE]
> Pre-Trained weights aren't up to par as a pure zero-shot model at the moment, but are fine for finetuning / LoRAs.
> [!NOTE] Pre-Trained weights aren't up to par as a pure zero-shot model at the moment, but are fine for finetuning / LoRAs.
My pre-trained weights can be acquired from [here](https://huggingface.co/ecker/vall-e).

View File

@ -1,4 +1,3 @@
experimental: False # should probably expand this into a dict of experimental flags
sample_rate: 24_000 # 44_000 for dac
audio_backend: "vocos" # or dac
@ -131,7 +130,9 @@ dataset:
max_resps: 1
p_resp_append: 0.25
sample_type: path # speaker
sample_type: path # path | speaker | group
sample_order: duration # shuffle | duration
sample_max_duration_batch: 0 # used when above = duration, 120 seconds per batch at 12GiB of VRAM works
tasks_list: [ "tts" ] # , [ "tts", "tts-c", "ns", "sr", "tse", "cse", "nse", "tts"]

View File

@ -547,7 +547,7 @@ class Dataset(_Dataset):
if self.sampler_type == "path":
if self.sampler_order == "duration" and cfg.dataset.sample_max_duration_batch > 0:
self.sampler = BatchedOrderedSampler( self.duration_buckets, cfg.dataset.sample_max_duration_batch, cfg.hyperparameters.batch_size if training else cfg.evaluation.batch_size )
self.sampler = BatchedOrderedSampler( self.duration_buckets, cfg.dataset.sample_max_duration_batch, cfg.hyperparameters.batch_size if self.training else cfg.evaluation.batch_size )
else:
self.sampler = OrderedSampler( len(self) )
self.samplers = {}

View File

@ -159,13 +159,11 @@ def load_engines(training=True):
for k in erase:
del state[k]
# resize text embedding
if "text_emb.weight" in state and model.config.text_tokens != state["text_emb.weight"].shape[0]:
state["text_emb.weight"] = state["text_emb.weight"][:model.config.text_tokens]
# resize text embedding
if "rvq_l_emb.weight" in state and model.config.resp_levels != state["rvq_l_emb.weight"].shape[0]:
state["rvq_l_emb.weight"] = state["rvq_l_emb.weight"][:model.config.resp_levels]
# resize embeddings
if "text_emb.weight" in state:
state["text_emb.weight"] = ml.resize_weight( state["text_emb.weight"], model.config.text_tokens )
if "rvq_l_emb.weight" in state:
state["rvq_l_emb.weight"] = ml.resize_weight( state["rvq_l_emb.weight"], model.config.resp_levels )
model.load_state_dict(state, strict=cfg.trainer.strict_loading)

View File

@ -30,7 +30,7 @@ def train_feeder(engine, batch):
with torch.autocast("cuda", dtype=cfg.trainer.dtype, enabled=cfg.trainer.amp):
batch_size = len(batch["text"])
engine.current_batch_size = batch_size
if engine.hyper_config.experimental:
if cfg.model.interleave:
quant_levels = 0
@ -116,7 +116,12 @@ def run_eval(engines, eval_name, dl):
processed = 0
while processed < cfg.evaluation.size:
batch: dict = to_device(next(iter(dl)), cfg.device)
batch = to_device(next(iter(dl)), cfg.device)
# limit to eval batch size in the event we somehow have a weird dataloader
for key in batch.keys():
batch[key] = batch[key][:cfg.evaluation.batch_size]
processed += len(batch["text"])
for name in engines:

View File

@ -212,6 +212,20 @@ def replace_attention( model, klass, target, mode="math", verbose=False ):
return model
# trim/expand a tensor (for example, in a state dict)
def resize_weight( weight, target ):
	"""Resize a weight tensor along dim 0 to exactly `target` rows.

	Used when loading a state dict whose embedding tables differ in size
	from the current model config (e.g. `text_emb.weight`, `rvq_l_emb.weight`).

	* target < rows: trim, keeping the first `target` rows.
	* target > rows: pad with uniform-random rows matching the original's
	  device/dtype (random init for the new entries; they get trained later).
	* target == rows: return the input tensor unchanged.
	"""
	rows = weight.shape[0]
	# trim
	if target < rows:
		return weight[:target]
	# expand: one vectorized rand + cat instead of re-stacking every
	# existing row through a Python list (avoids O(rows) per-row ops)
	if target > rows:
		padding = torch.rand(
			target - rows, *weight.shape[1:],
			device=weight.device, dtype=weight.dtype,
		)
		return torch.cat( [ weight, padding ] )
	return weight
# https://github.com/konstmish/prodigy
try:
from prodigyopt import Prodigy