tweaks
This commit is contained in:
parent
f3c59c3e7e
commit
91062361af
@ -199,7 +199,8 @@ class Engine():
|
|||||||
self._get_grad_norm()
|
self._get_grad_norm()
|
||||||
|
|
||||||
def _get_grad_norm(self):
|
def _get_grad_norm(self):
|
||||||
self._global_grad_norm = torch.cat([ param.grad.detach().flatten() for param in self.module.parameters() if param.grad is not None ]).norm().item()
|
t = [ param.grad.detach().flatten() for param in self.module.parameters() if param.grad is not None ]
|
||||||
|
self._global_grad_norm = torch.cat(t).norm().item() if len(t) else 0
|
||||||
|
|
||||||
def get_lr(self):
|
def get_lr(self):
|
||||||
lrs = []
|
lrs = []
|
||||||
|
|||||||
@ -336,7 +336,7 @@ def example_usage():
|
|||||||
proms_list = proms_list[:1]
|
proms_list = proms_list[:1]
|
||||||
resps_list = resps_list[:1]
|
resps_list = resps_list[:1]
|
||||||
|
|
||||||
"""
|
# retnet-full is the only configuration with BitNet's BitLinear that converges despite the grad_norm saying otherwise
|
||||||
kwargs = {
|
kwargs = {
|
||||||
'n_tokens': 1024,
|
'n_tokens': 1024,
|
||||||
'd_model': 1024, # 256, # 1024, # 1536
|
'd_model': 1024, # 256, # 1024, # 1536
|
||||||
@ -352,6 +352,7 @@ def example_usage():
|
|||||||
'n_layers': 12,
|
'n_layers': 12,
|
||||||
'n_experts': 8,
|
'n_experts': 8,
|
||||||
}
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user