diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 902d759..051af63 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3628,8 +3628,6 @@ template __global__ void kgemm_4bit_inferenc #pragma unroll for(int k = 0; k < num_values_4bit; k++) { - if((float)local_A[k] < -10.0f || (float)local_B[k] < -10.0f || local_C > 10.0f) - printf("%f %f = %f\n", (float)local_A[k], (float)local_B[k], local_C); #if __CUDA_ARCH__ >= 800 local_C += (float)(local_A[k]*local_B[k]); #else diff --git a/tests/test_functional.py b/tests/test_functional.py index e80eed3..9bcc3fa 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2442,7 +2442,7 @@ def test_gemv_4bit(dtype, storage_type, double_quant): assert sum(relerrs)/len(relerrs)/math.sqrt(dim) < 0.0005 elif dtype == torch.float32: assert sum(errs)/len(errs)/math.sqrt(dim) < 5e-8 - assert sum(relerrs)/len(relerrs)/math.sqrt(dim) < 1e-8 + assert sum(relerrs)/len(relerrs)/math.sqrt(dim) < 1e-7 elif dtype == torch.bfloat16: assert sum(errs)/len(errs)/math.sqrt(dim) < 3e-4 assert sum(relerrs)/len(relerrs)/math.sqrt(dim) < 0.003