tweaks, fixes, cleanup, added reporting accuracy/precision from the VALL-E trainer (which indirectly revealed a grody bug in the VALL-E trainer), some other cr*p

This commit is contained in:
mrq 2023-08-05 22:42:05 +00:00
parent 77a9625e93
commit 93987ea5d6
11 changed files with 56 additions and 124 deletions

.gitignore vendored
View File

@@ -5,6 +5,6 @@ __pycache__
 /.cache
 /config
 /*.egg-info
-/vall_e/version.py
+/image_classifier/version.py
 /build
 /.cache

View File

@@ -4,7 +4,7 @@ This is a simple ResNet based image classifier for """specific images""", using
 ## Premise
-This was cobbled together in a night, partly to test how well my training framework fares when not married to my VALL-E implementation, and partly to solve a problem I have recently faced. Since I've been balls deep in learning the ins and outs of making VALL-E work, why not do the exact opposite (a tiny, image classification model of fixed lengths) to test the framework and my knowledge? Thus, this """ambiguous""" project is born.
+This was cobbled together in a night, partly to test how well my training framework fares when not married to my VALL-E implementation, and partly to solve a minor problem I have recently faced. Since I've been balls deep in learning the ins and outs of making VALL-E work, why not do the exact opposite (a tiny, image classification model of fixed lengths) to test the framework and my knowledge? Thus, this """ambiguous""" project is born.
 This is by no ways state of the art, as it just leverages an existing ResNet arch provided by `torchvision`.
@@ -22,13 +22,17 @@ This is by no ways state of the art, as it just leverages an existing ResNet arch provided by `torchvision`.
 ## Inferencing
-Simply invoke the inferencer with the following command: `python3 -m image_classifier "./data/path-to-your-image.png" yaml="./data/config.yaml" --temp=1.0`
+Simply invoke the inferencer with the following command: `python3 -m image_classifier --path="./data/path-to-your-image.png" yaml="./data/config.yaml" --temp=1.0`
+### Continuous Usage
+If you're looking to continuously classify images, use `python3 -m image_classifier --listen --port=7860 yaml="./data/config.yaml" --temp=1.0` instead to enable a light webserver using `simple_http_server`. Send a `GET` request to `http://127.0.0.1:7860/?b64={base64 encoded image string}` and a JSON response will be returned with the classified label.
 ## Known Issues
 * Setting `dataset.workers` higher than 0 will cause issues when using the local engine backend. Use DeepSpeed.
+* The evaluation / validation routine doesn't quite work.
 * Using `float16` with the local engine backend will cause instability in the losses. Use DeepSpeed.
+* Web server doesn't emit `content-type: application/json`, nor accept JSON `POST`s at the moment.
 ## Strawmen
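As a quick illustration of the continuous-usage mode described in the README hunk above, here is a minimal client sketch, assuming the default port of 7860, the third-party `requests` package, and a hypothetical image path; it is not part of the repo:

# Sketch: query the classifier webserver with a base64-encoded image.
import base64
import requests

with open("./data/path-to-your-image.png", "rb") as f:  # hypothetical image path
    b64 = base64.b64encode(f.read()).decode("utf-8")

# GET / with a `b64` query parameter; the body is JSON like {"answer": "..."} even though
# the content-type header isn't set yet (see Known Issues above).
response = requests.get("http://127.0.0.1:7860/", params={"b64": b64})
print(response.json()["answer"])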

View File

@@ -1,6 +1,6 @@
 dataset:
   training: [
-    "./data/captchas/"
+    "./data/images/"
   ]
   validation: []
@@ -12,17 +12,17 @@ dataset:
 models:
   _models:
-  - name: "captcha"
+  - name: "classifier"
     tokens: 0
     len: 6
 hyperparameters:
   batch_size: 256
-  gradient_accumulation_steps: 5
+  gradient_accumulation_steps: 64
   gradient_clipping: 100
   optimizer: Adamw
-  learning_rate: 5.0e-5
+  learning_rate: 1.0e-3
   scheduler_type: ""
   #scheduler_type: OneCycle
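A note on the hyperparameter bump above: assuming the trainer multiplies the two values in the usual way, going from gradient_accumulation_steps: 5 to 64 with batch_size: 256 raises the effective batch per optimizer step from 256 × 5 = 1,280 to 256 × 64 = 16,384 samples, which is presumably also why the learning rate was raised from 5.0e-5 to 1.0e-3.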

View File

@@ -3,13 +3,13 @@ import base64
 from io import BytesIO
 from pathlib import Path
-from .inference import CAPTCHA
+from .inference import Classifier
 from PIL import Image
 from simple_http_server import route, server
 def main():
-    parser = argparse.ArgumentParser("CAPTCHA", allow_abbrev=False)
+    parser = argparse.ArgumentParser(allow_abbrev=False)
     parser.add_argument("--listen", action='store_true')
     parser.add_argument("--port", type=int, default=9090)
     parser.add_argument("--yaml", type=Path, default=None)
@@ -18,15 +18,15 @@ def main():
     parser.add_argument("--device", default="cuda")
     args, unknown = parser.parse_known_args()
-    captcha = CAPTCHA( config=args.yaml, ckpt=args.ckpt, device=args.device )
+    classifier = Classifier( config=args.yaml, ckpt=args.ckpt, device=args.device )
     if args.listen:
        @route("/")
        def inference( b64, temperature=1.0 ):
            image = Image.open(BytesIO(base64.b64decode(b64))).convert("RGB")
-           return { "answer": captcha.inference( image=image, temperature=args.temp ) }
+           return { "answer": classifier.inference( image=image, temperature=args.temp ) }
        server.start(port=args.port)
     else:
-       parser = argparse.ArgumentParser("CAPTCHA", allow_abbrev=False)
+       parser = argparse.ArgumentParser(allow_abbrev=False)
        parser.add_argument("--path", type=Path)
        parser.add_argument("--base64", type=str)
        parser.add_argument("--temp", type=float, default=1.0)
@@ -39,7 +39,7 @@ def main():
        else:
            raise "Specify a --path or --base64."
-       answer = captcha.inference( image=image, temperature=args.temp )
+       answer = classifier.inference( image=image, temperature=args.temp )
        print("Answer:", answer)
 if __name__ == "__main__":

View File

@@ -111,6 +111,7 @@ class Dataset:
     temp: list[Path] = field(default_factory=lambda: [])
+    # de-implemented, because the data isn't that large to facilitate HDF5
     hdf5_name: str = "data.h5"
     use_hdf5: bool = False
@@ -149,12 +150,12 @@ class Models:
 class Hyperparameters:
     batch_size: int = 8
     gradient_accumulation_steps: int = 32
-    gradient_clipping: int = 100
+    gradient_clipping: int = 100 # to be implemented in the local backend
     optimizer: str = "Adamw"
     learning_rate: float = 3.25e-4
-    scheduler_type: str = ""
+    scheduler_type: str = "" # to be implemented in the local backend
     scheduler_params: dict = field(default_factory=lambda: {})
 @dataclass()
@@ -317,7 +318,7 @@ class Trainer:
 @dataclass()
 class Inference:
-    use_vocos: bool = True
+    use_vocos: bool = True # artifact from the VALL-E trainer
 @dataclass()
 class BitsAndBytes:

View File

@@ -1,7 +1,7 @@
 # todo: clean this mess up
 import copy
-import h5py
+# import h5py
 import json
 import logging
 import numpy as np
@@ -60,7 +60,7 @@ class Dataset(_Dataset):
         self.training = training
         self.transform = transforms.Compose([
-            transforms.Resize((self.height, self.width)),
+            #transforms.Resize((self.height, self.width)), # for some reason, running the validation dataset breaks when this is set. all images *should* be normalized anyhow
             transforms.ToTensor(),
             transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
         ])
@@ -73,13 +73,14 @@ class Dataset(_Dataset):
     def __getitem__(self, index):
         path = self.paths[index]
+        # stupid try/except when the original VALL-E training framework was able to insert foreign symbols into the symmap, but that functionality isn't really necessary here
         try:
             text = torch.tensor([*map(self.symmap.get, _get_symbols(path.stem))]).to(torch.uint8)
         except Exception as e:
             print("Invalid symbol:", _get_symbols(path.stem), [*map(self.symmap.get, _get_symbols(path.stem))], path.stem)
             raise e
-        image = self.transform(Image.open(path).convert('RGB')).to(cfg.trainer.dtype)
+        image = self.transform(Image.open(path).convert('RGB')).to(cfg.trainer.dtype) # resnet has to be RGB
         return dict(
             index=index,
@@ -192,7 +193,7 @@ def create_train_val_dataloader():
     subtrain_dataset = copy.deepcopy(train_dataset)
     subtrain_dataset.head_(cfg.evaluation.size)
-    #subtrain_dataset.training_(False)
+    subtrain_dataset.training_(False)
     train_dl = _create_dataloader(train_dataset, training=True)
     val_dl = _create_dataloader(val_dataset, training=False)
@@ -209,9 +210,11 @@ def create_train_val_dataloader():
     return train_dl, subtrain_dl, val_dl
+"""
 if __name__ == "__main__":
     create_dataset_hdf5()
     train_dl, subtrain_dl, val_dl = create_train_val_dataloader()
     sample = train_dl.dataset[0]
     print(sample)
+"""

View File

@@ -48,6 +48,7 @@ if not distributed_initialized() and cfg.trainer.backend == "local":
     init_distributed(torch.distributed.init_process_group)
 # A very naive engine implementation using barebones PyTorch
+# to-do: implement lr_sheduling
 class Engine():
     def __init__(self, *args, **kwargs):
         self.module = kwargs['model'].to(cfg.device).to(cfg.trainer.dtype)
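For readers unfamiliar with what the "very naive engine" amounts to, here is a hedged sketch of the general pattern (not the repo's actual Engine class): wrap the model and optimizer, run plain forward/backward/step, no LR scheduling, which is exactly what the new to-do refers to.

# Sketch of a barebones local engine; gradient clipping and LR scheduling are
# the pieces the config comments mark as "to be implemented".
import torch

class NaiveEngine:
    def __init__(self, model: torch.nn.Module, optimizer: torch.optim.Optimizer):
        self.module = model
        self.optimizer = optimizer

    def step(self, loss: torch.Tensor):
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()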

View File

@@ -6,7 +6,7 @@ from .config import cfg
 from .export import load_models
 from .data import get_symmap, _get_symbols
-class CAPTCHA():
+class Classifier():
     def __init__( self, width=300, height=80, config=None, ckpt=None, device="cuda", dtype="float32" ):
         self.loading = True
         self.device = device
@@ -48,6 +48,6 @@ class CAPTCHA():
         image = self.transform(image).to(self.device)
         answer = self.model( image=[image], sampling_temperature=temperature )
-        answer = answer[0].replace('<s>', "").replace("</s>", "")
+        answer = answer[0].replace('<s>', "").replace("</s>", "") # it would be better to just slice between these, but I can't be assed
         return answer
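On the new comment about slicing between the markers: a sketch of what that would look like, assuming `<s>` and `</s>` each appear once in the decoded answer:

# Sketch: take only the text between <s> and </s> instead of stripping them.
def slice_answer(raw: str) -> str:
    start = raw.find("<s>") + len("<s>")
    end = raw.find("</s>", start)
    return raw[start:end] if end != -1 else raw[start:]

print(slice_answer("<s>a3f9qz</s>"))  # "a3f9qz"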

View File

@@ -16,28 +16,6 @@ from torchvision.models import resnet18
 from ..data import get_symmap
-def _create_mask(l, device):
-    """1 is valid region and 0 is invalid."""
-    seq = torch.arange(max(l), device=device).unsqueeze(0)  # (1 t)
-    stop = torch.tensor(l, device=device).unsqueeze(1)  # (b 1)
-    return (seq < stop).float()  # (b t)
-def list_to_tensor(x_list: list[Tensor], pattern="t b c -> b t c"):
-    """
-    Args:
-        x_list: [(t d)]
-    Returns:
-        x: (? ? ?)
-        m: (? ? ?), same as x
-    """
-    l = list(map(len, x_list))
-    x = rearrange(pad_sequence(x_list), pattern)
-    m = _create_mask(l, x_list[0].device)
-    m = m.t().unsqueeze(-1)  # (t b 1)
-    m = rearrange(m, pattern)
-    m = m.to(x)
-    return x, m
 class Model(nn.Module):
     def __init__(
         self,
@@ -61,9 +39,20 @@ class Model(nn.Module):
         self.resnet = resnet18(pretrained=False)
         self.resnet.fc = nn.Linear( self.d_model, self.n_tokens * self.n_len )
-        self.criterion = nn.CTCLoss(zero_infinity=True)
+        self.accuracy_metric = MulticlassAccuracy(
+            n_tokens,
+            #top_k=10,
+            average="micro",
+            multidim_average="global",
+        )
+        self.precision_metric = MulticlassPrecision(
+            n_tokens,
+            #top_k=10,
+            average="micro",
+            multidim_average="global",
+        )
     def forward(
         self,
@@ -77,6 +66,7 @@ class Model(nn.Module):
         x = self.resnet( x_list )
         y = x.view(x.size(0), self.n_len, self.n_tokens)
+        # either of these should do, but my VALL-E forward pass uses this, so might as well keep to it
         # pred = y.argmax(dim=2)
         pred = Categorical(logits=y / sampling_temperature).sample()
@@ -84,87 +74,20 @@ class Model(nn.Module):
         if text is not None:
             y_list = rearrange(pad_sequence(text), "t b -> b t")
             loss = 0
             for i in range(self.n_len):
+                if i >= y_list.shape[1]:
+                    break
                 loss += F.cross_entropy( y[:, i], y_list[:, i] )
             self.loss = dict(
                 nll=loss
             )
+            self.stats = dict(
+                acc = self.accuracy_metric( pred, y_list ),
+                precision = self.precision_metric( pred, y_list ),
+            )
         return answer
-def example_usage():
-    from ..config import cfg
-    cfg.trainer.backend = "local"
-    cfg.trainer.check_for_oom = False
-    from functools import partial
-    from einops import repeat
-    from ..emb.qnt import decode_to_file
-    from ..engines import Engine, Engines
-    from tqdm import tqdm, trange
-    from .ar import AR
-    from .nar import NAR
-    device = "cpu"
-    x8 = partial(repeat, pattern="t -> t l", l=2)
-    symmap = {'<s>': 1, '</s>': 2, ' ': 3, '.': 4, ',': 5, '!': 6, '?': 7, 'p': 7, 'iː': 8, 'ɚ': 9, 'ˌ': 10, '': 11, '': 12, 'd': 13, 'ɹ': 14, 'tˈ': 15, '': 16, 'uː': 17, 'l': 18, 'æ': 19, 'ɛ': 20, 'ɪ': 21, 'j': 22, 'ʊ': 23, 't': 24, 'n': 25, 'v': 26, 'a': 27, 'o': 28, 'ŋ': 29, 'w': 30, 'ʌ': 31, 'hˈ': 32, 'ɡˈ': 33, 'ə': 34, 'θˈ': 35, 'dˈ': 36, '': 37, 'h': 38, 'z': 39, 'k': 40, 'ð': 41, 'ɡˌ': 42, 'ˈ': 43, 'fˈ': 44, 'i': 45, 's': 46, 'ʃ': 47, 'wˈ': 48, 'ðˈ': 49, 'ɹˈ': 50, 'lˈ': 51, 'ɡ': 52, 'oː': 53, 'mˈ': 54, 'e': 55, 'ɑː': 56, 'nˈ': 57, 'm': 58, 'θˌ': 59, 'sˈ': 60, 'f': 61, 'ɔː': 62, '': 63, 'b': 64, 'jˈ': 65, 'ɐ': 66, 'ʒˈ': 67, 'θ': 68, 'bˈ': 69, 'ɾ': 70, 'ɜː': 71, 'ʌˈ': 72, 'ʃˌ': 73, '': 74, 'kˈ': 75, 'ɔ': 76, 'zˈ': 77, '': 78, '': 79, 'vˈ': 80, '': 81, 'ʒ': 82, 'ʃˈ': 83, 'ɹˌ': 84, '': 85, 'pˈ': 86, 'ðˌ': 87, '': 88, '': 89, '': 90, '̩': 91, 'ʔ': 92, '': 93, 'ɪˈ': 94, '"': 95, 'ɪˌ': 96, 'ʒˌ': 97, 'uːˌ': 98, 'ʊˈ': 99, '': 100, 'uːˈ': 101, 'iːˈ': 102, '': 103, '.ˈ': 104, '': 105, 'ŋˌ': 106, 'ɐˌ': 107, '—ˈ': 108, '': 109, 'iːˌ': 110, 'ɛː': 111, ')': 112, ')ˈ': 113, '(': 114, 'u': 115, '-': 116, 'ɖˈ': 117, 'iˈ': 118, 'ʰˈ': 119, 'ɟˈ': 120, '̃': 121, 'eː': 122, 'ɾˈ': 123, 'r': 124, 'ʰ': 125, '': 126, 'ɫ': 127, 'q': 128, '': 129, 'ʊˌ': 130, 'aː': 131, 'cˈ': 132, '…ˈ': 133, 'c': 134, 'ɳ': 135, 'ɐˈ': 136, 'x': 137, 'ʔˌ': 138, '': 139, 'ɑ': 140, '?ˈ': 141, '̩ˈ': 142, '"ˈ': 143, ',ˈ': 144, 'ŋˈ': 145, 'əˌ': 146, '!ˈ': 147, '"ˌ': 148, '': 149, '': 150, '—ˌ': 151, '̩ˌ': 152, 'əˈ': 153, '': 154, 'ɬ': 155, 'ʲ': 156, '¡': 157, 'ɯ': 158, '': 159, 'ʑ': 160, 'ʑˈ': 161, '¿': 162, 'ɑːˈ': 163, 'iːː': 164, 'ɛˈ': 165, '¡ˈ': 166, 'æˈ': 167, 'ç': 168, 'ɾˌ': 169, 'ᵻˈ': 170, 'xˈ': 171, 'ɔːˈ': 172, ';': 173, 'ɬˌ': 174, ':': 175, 'ʔˈ': 176, 'ɑːˌ': 177, 'ɬˈ': 178}
-    def tokenize(content, lang_marker="en"):
-        split = content.split(" ")
-        phones = [f"<s>"] + [ " " if not p else p for p in split ] + [f"</s>"]
-        return torch.tensor([*map(symmap.get, phones)]).to()
-    kwargs = {
-        'n_tokens': 1024,
-        'd_model': 1024,
-        'n_heads': 16,
-        'n_layers': 12,
-    }
-    models = { "ar": AR(**kwargs).to(device), "nar": NAR(**kwargs).to(device) }
-    engines = Engines({ name: Engine(model=model, optimizer=torch.optim.AdamW(model.parameters(), lr=1e-4)) for name, model in models.items() })
-    train = True
-    def sample( name, steps=400 ):
-        AR = None
-        NAR = None
-        engines.eval()
-        for name, engine in engines.items():
-            if name[:2] == "ar":
-                AR = engine
-            elif name[:3] == "nar":
-                NAR = engine
-        resps_list = AR(text_list, proms_list, max_steps=steps, sampling_temperature=1.0)
-        resps_list = [r.unsqueeze(-1) for r in resps_list]
-        codes = NAR( text_list, proms_list, resps_list=resps_list, sampling_temperature=0.2 )
-        decode_to_file(resps_list[0], f"./data/ar.{name}.wav", device=device)
-        decode_to_file(codes[0], f"./data/ar+nar.{name}.wav", device=device)
-    if train:
-        sample("init", 15)
-        engines.train()
-        t = trange(60)
-        for i in t:
-            stats = engines.step({"text_list": text_list, "proms_list": proms_list, "resps_list": resps_list}, device="cpu")
-            t.set_description(f"{stats}")
-    else:
-        for name, engine in engines.items():
-            engine.module.load_state_dict(torch.load(f"./data/{name}.pth"))
-    sample("final")
-if __name__ == "__main__":
-    example_usage()
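To make the shape handling in the forward pass above concrete: the ResNet head emits `n_len * n_tokens` logits, which get reshaped to (batch, n_len, n_tokens), sampled per position, and scored with per-position cross entropy plus the newly added accuracy/precision reporting. The following is a self-contained sketch under assumed sizes (n_len=6, n_tokens=36), with accuracy computed by hand rather than through the metric classes:

# Sketch of fixed-length, per-position classification on a ResNet trunk.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from torchvision.models import resnet18

n_len, n_tokens = 6, 36                               # assumed label length and alphabet size
model = resnet18()                                    # no pretrained weights
model.fc = nn.Linear(model.fc.in_features, n_tokens * n_len)

images = torch.randn(4, 3, 80, 300)                   # (batch, rgb, height, width)
targets = torch.randint(0, n_tokens, (4, n_len))

logits = model(images).view(-1, n_len, n_tokens)      # (batch, n_len, n_tokens)
pred = Categorical(logits=logits / 1.0).sample()      # temperature-scaled sampling, as in the diff

loss = sum(F.cross_entropy(logits[:, i], targets[:, i]) for i in range(n_len))
acc = (pred == targets).float().mean()                # rough stand-in for MulticlassAccuracy(average="micro")
print(loss.item(), acc.item())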

View File

@@ -23,11 +23,13 @@ def train_feeder(engine, batch):
     engine( image=batch["image"], text=batch["text"] )
     losses = engine.gather_attribute("loss")
+    stat = engine.gather_attribute("stats")
     loss = torch.stack([*losses.values()]).sum()
     stats = {}
     stats |= {k: v.item() for k, v in losses.items()}
+    stats |= {k: v.item() for k, v in stat.items()}
     return loss, stats
@@ -55,7 +57,6 @@ def run_eval(engines, eval_name, dl):
     for batch in tqdm(dl):
         batch: dict = to_device(batch, cfg.device)
-        # if we're training both models, provide output for both
         res = model( image=batch['image'], text=batch['text'], sampling_temperature=cfg.evaluation.temperature )
         for path, ref, hyp in zip(batch["path"], batch["text"], res):

View File

@@ -1 +0,0 @@
-__version__ = "0.0.1-dev20230804142130"