mucked around with the loss calculation, this seems better?

This commit is contained in:
mrq 2023-10-13 18:22:21 -05:00
parent fb467b19ba
commit a539f6889f
2 changed files with 27 additions and 24 deletions

View File

@ -118,16 +118,24 @@ class AR_NAR(Base):
# is training
if n_levels == self.n_resp_levels:
# might be better to have this decided on the dataloader level
if cfg.models.ar_nar.p_ar_level == "auto" or cfg.models.ar_nar.p_ar_level is None:
quant_levels = torch.randint(0, self.n_resp_levels, (batch_size,)) # randomly select a target RVQ-bin level (0 being AR, 1+ being NAR)
else:
quant_levels = torch.Tensor([ [ 0 if random.random() < cfg.models.ar_nar.p_ar_level else random.randint(1, self.n_resp_levels) ] for _ in range(batch_size) ])
targ_list = [r[..., l] for r, l in zip(resps_list, quant_levels)] # ensures we only have 1 RVQ-bin (our target)
resps_list = [r if l == 0 else r[..., :l] for r, l in zip(resps_list, quant_levels)] # yes I can just do min(1, l)
resps_list = [r if l == 0 else r[..., :l] for r, l in zip(resps_list, quant_levels)] # r[..., 0] is technically correct, but only r[:, 0] gets passed through the embedding
if cfg.experimental:
proms_list = [ r if l == 0 else trim(r, 75 * 3) for r, l in zip(proms_list, quant_levels) ] # trim input prompt to 3 seconds
# append stop tokens for AR
for i in range(batch_size):
if quant_levels[i] > 0:
continue
resps_list[i] = torch.cat([resps_list[i], torch.Tensor([[self.stop_token] * n_levels]).to(device=device, dtype=torch.int16) ])
targ_list[i] = torch.cat([targ_list[i], torch.Tensor([self.stop_token]).to(device=device, dtype=torch.int16) ])
return super().forward(
text_list=text_list,
@ -294,6 +302,8 @@ def example_usage():
qnt = torch.load("data/qnt.pt")[0].t()[:, :cfg.models.prom_levels].to(device)
cfg.hyperparameters.gradient_accumulation_steps = 1
text_list = [
tokenize("ˈ a ɪ w ɪ l nˌ ɑː t ˈ æ s k ɐ sˈ ɛ k ə n d tˈ a ɪ m").to(device),
]
@ -323,10 +333,9 @@ def example_usage():
"""
model = AR_NAR(**kwargs).to(device)
#steps = 500
#optimizer = ml.Prodigy(model.parameters(), lr=1.0)
steps = 1000
optimizer = ml.AdamW(model.parameters(), lr=1.0e-4)
steps = 250
optimizer = ml.Prodigy(model.parameters(), lr=1.0)
#optimizer = ml.AdamW(model.parameters(), lr=1.0e-4)
engine = Engine(model=model, optimizer=optimizer)
torch.save( {

View File

@ -351,30 +351,24 @@ class Base(nn.Module):
# compute loss if the target is given
if targ_list is not None:
ignore_sep = torch.tensor(self.ignore_index, device=device)
# create a tensor sequence with one RVQ-bin of the input prompt, but with `ignore_index`, as the prompt is not needed for computing the loss against
prom_list = [ torch.full_like(t[..., 0], self.ignore_index) for t in proms_list ]
# remake input sequence
text_prom_list = self._samplewise_merge_tensors(
target_list = self._samplewise_merge_tensors(
text_list,
lang_list,
prom_list,
sep=ignore_sep
[ torch.full_like(t[..., 0], self.ignore_index) for t in proms_list ], # create a tensor sequence with one RVQ-bin of the input prompt, but with `ignore_index`, as the prompt is not neeeded for computing the loss against
targ_list,
sep=torch.tensor(self.ignore_index, device=device)
)
# process each batch
for i in range(len(text_prom_list)):
# for the AR and NAR, shift the text/input prompt into the future by 1, and ignore the rolled back token
text_prom_list[i] = text_prom_list[i].roll(-1, dims=0)
text_prom_list[i][-1] = self.ignore_index
# modify only for the AR so it can properly behave like a transformer
for i in range(len(target_list)):
if quant_levels is not None and quant_levels[i] > 0:
continue
# for the AR, shift the target response into the future by 1, and ignore the rolled back text token
if quant_levels is None or quant_levels[i] == 0:
targ_list[i] = targ_list[i].clone().roll(-1, dims=0) # clone ensures it's not an aliased copy/view of resps
targ_list[i][-1] = self.stop_token
logits[i] = logits[i][..., :-1, :] # shift the target so that token n...
target_list[i] = target_list[i][..., 1:] # predicts token n + 1
# create the new target sequence to compute the loss against
target = torch.cat( self._samplewise_merge_tensors( text_prom_list, targ_list, sep=ignore_sep ) )
target = torch.cat( target_list )
inputs = torch.cat( logits )
self.loss = dict(