Merge branch 'bugfix_cpu_and_cc' into main

Tim Dettmers 2022-09-05 16:23:03 -07:00
commit eab4d8232d
7 changed files with 48 additions and 58 deletions

bitsandbytes/cuda_setup/main.py

@@ -17,6 +17,7 @@ evaluation:
"""
import ctypes
import torch
from pathlib import Path
from ..utils import execute_and_return
@@ -28,7 +29,7 @@ def check_cuda_result(cuda, result_val):
if result_val != 0:
error_str = ctypes.c_char_p()
cuda.cuGetErrorString(result_val, ctypes.byref(error_str))
raise Exception(f"CUDA exception! Error code: {error_str.value.decode()}")
print(f"CUDA exception! Error code: {error_str.value.decode()}")
def get_cuda_version(cuda, cudart_path):
# https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION
@@ -57,7 +58,7 @@ def get_cuda_lib_handle():
cuda = ctypes.CDLL("libcuda.so")
except OSError:
# TODO: shouldn't we error or at least warn here?
raise Exception('CUDA SETUP: ERROR! libcuda.so not found! Do you have a CUDA driver installed? If you are on a cluster, make sure you are on a CUDA machine!')
print('CUDA SETUP: WARNING! libcuda.so not found! Do you have a CUDA driver installed? If you are on a cluster, make sure you are on a CUDA machine!')
return None
check_cuda_result(cuda, cuda.cuInit(0))
@@ -119,6 +120,10 @@ def evaluate_cuda_setup():
print('For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link')
print('='*80)
binary_name = "libbitsandbytes_cpu.so"
#if not torch.cuda.is_available():
#print('No GPU detected. Loading CPU library...')
#return binary_name
cudart_path = determine_cuda_runtime_lib_path()
if cudart_path is None:
print(

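The hunks above trade hard failures (raise Exception) for printed warnings, so a machine without a CUDA driver can still import the package and fall through to the CPU binary. A minimal sketch of that fallback pattern; the helper names below are hypothetical, not the library's own:

import ctypes

def load_cuda_driver():
    # Return a handle to libcuda.so, or None on machines without a CUDA driver.
    try:
        return ctypes.CDLL("libcuda.so")
    except OSError:
        # Warn instead of raising so the CPU code path stays usable.
        print("CUDA SETUP: WARNING! libcuda.so not found; falling back to CPU.")
        return None

def pick_binary_name():
    # Choose the shared library to load: CPU build unless a driver handle exists.
    if load_cuda_driver() is None:
        return "libbitsandbytes_cpu.so"
    return "libbitsandbytes_cuda.so"  # placeholder, not the GPU build's real filename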
bitsandbytes/functional.py

@@ -185,14 +185,9 @@ def create_dynamic_map(signed=True, n=7):
def get_special_format_str():
if not torch.cuda.is_available(): return 'col_turing'
major, minor = torch.cuda.get_device_capability()
if major < 7:
print(
f"Device with CUDA capability of {major} not supported for 8-bit matmul. Device has no tensor cores!"
)
assert major >= 7
if major == 7:
if major <= 7:
return "col_turing"
elif major == 8:
return "col_ampere"
@@ -1685,21 +1680,6 @@ def double_quant(
return out_row, out_col, row_stats, col_stats, coo_tensor
def get_special_format_str():
major, minor = torch.cuda.get_device_capability()
if major < 7:
print(
f"Device with CUDA capability of {major} not supported for 8-bit matmul. Device has no tensor cores!"
)
assert major >= 7
if major == 7: return 'col_turing'
elif major == 8: return 'col_ampere'
else: return 'col_turing'
def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
prev_device = pre_call(A.device)
if state is None: state = (A.shape, from_order)
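With the duplicate definition removed, get_special_format_str returns 'col_turing' when CUDA is unavailable and keys only on the major compute capability: anything up to 7.x maps to the Turing layout, 8.x to the Ampere layout. A minimal standalone sketch of that mapping (a rewrite for illustration, not the library function itself):

import torch

def special_format_for_device():
    # CPU-only machines default to the Turing layout string.
    if not torch.cuda.is_available():
        return "col_turing"
    major, _minor = torch.cuda.get_device_capability()
    # <= 7.x (Turing and older) -> col_turing; 8.x (Ampere) -> col_ampere.
    if major <= 7:
        return "col_turing"
    elif major == 8:
        return "col_ampere"
    return "col_turing"  # assumption: anything newer also falls back to the Turing layout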

bitsandbytes/optim/__init__.py

@@ -5,13 +5,12 @@
from bitsandbytes.cextension import COMPILED_WITH_CUDA
if COMPILED_WITH_CUDA:
from .adam import Adam, Adam8bit, Adam32bit
from .adamw import AdamW, AdamW8bit, AdamW32bit
from .sgd import SGD, SGD8bit, SGD32bit
from .lars import LARS, LARS8bit, LARS32bit, PytorchLARS
from .lamb import LAMB, LAMB8bit, LAMB32bit
from .rmsprop import RMSprop, RMSprop8bit, RMSprop32bit
from .adagrad import Adagrad, Adagrad8bit, Adagrad32bit
from .adam import Adam, Adam8bit, Adam32bit
from .adamw import AdamW, AdamW8bit, AdamW32bit
from .sgd import SGD, SGD8bit, SGD32bit
from .lars import LARS, LARS8bit, LARS32bit, PytorchLARS
from .lamb import LAMB, LAMB8bit, LAMB32bit
from .rmsprop import RMSprop, RMSprop8bit, RMSprop32bit
from .adagrad import Adagrad, Adagrad8bit, Adagrad32bit
from .optimizer import GlobalOptimManager
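Moving these imports out of the COMPILED_WITH_CUDA guard means the optimizer classes are imported unconditionally, so a CPU-only install still exposes them; actually stepping them continues to require the CUDA build. A hedged usage sketch of the standard bitsandbytes optimizer API:

import torch
import bitsandbytes as bnb

model = torch.nn.Linear(64, 64)
# The import above now succeeds without CUDA; Adam8bit itself still needs a GPU to run.
optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-3)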

csrc/ops.cu

@@ -371,7 +371,11 @@ template void transform<int32_t, COL32, ROW, false, 32>(cublasLtHandle_t ltHandl
template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
{
#ifdef NO_CUBLASLT
printf("ERROR: Your GPU does not support Int8 Matmul!");
cout << "" << endl;
cout << "=============================================" << endl;
cout << "ERROR: Your GPU does not support Int8 Matmul!" << endl;
cout << "=============================================" << endl;
cout << "" << endl;
assert(false);
return 0;

setup.py

@@ -18,7 +18,7 @@ def read(fname):
setup(
name=f"bitsandbytes",
version=f"0.32.1",
version=f"0.32.2",
author="Tim Dettmers",
author_email="dettmers@cs.washington.edu",
description="8-bit optimizers and matrix multiplication routines.",

tests/test_autograd.py

@@ -40,6 +40,7 @@ names = [
ids=names,
)
def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
if not torch.cuda.is_available(): pytest.skip('No GPU found.')
if dim2 > 0:
dim2 = dim2 - (dim2 % 16)
dim3 = dim3 - (dim3 % 16)
@@ -306,6 +307,7 @@ def test_matmullt(
has_fp16_weights,
has_bias
):
if not torch.cuda.is_available(): pytest.skip('No GPU found.')
dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3)
outlier_dim = torch.randint(0, dimA[1], size=(dimA[1] // 8,), device="cuda")
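Both test hunks add the same early-exit guard so the GPU-only tests are skipped, rather than erroring, on machines without CUDA. The pattern in isolation, with a hypothetical test name:

import pytest
import torch

def test_needs_gpu():
    # Skip cleanly on CPU-only machines instead of failing at the first cuda allocation.
    if not torch.cuda.is_available():
        pytest.skip("No GPU found.")
    x = torch.randn(4, 4, device="cuda")
    assert x.shape == (4, 4)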

tests/test_functional.py

@@ -1813,16 +1813,16 @@ def test_spmm_coo_dequant(dim1, dim2, dtype):
batch_size = 1
seqdim = 2048
seqdim = 1
values = []
values.append((batch_size, seqdim, 768, 4 * 768))
#values.append((batch_size, seqdim, 768, 4 * 768))
# values.append((batch_size, seqdim, 1024, 4*1024))
# values.append((batch_size, seqdim, 1536, 4*1536))
# values.append((batch_size, seqdim, 2048, 4*2048))
# values.append((batch_size, seqdim, 2560, 4*2560))
# values.append((batch_size, seqdim, 4096, 4*4096))
# values.append((batch_size, seqdim, 5140, 4*5140))
# values.append((batch_size, seqdim, 12288, 4*12288))
values.append((batch_size, seqdim, 12288, 4*12288))
names = [
"batch_{0}_seq_{1}_model_{2}_hidden_{3}".format(*vals) for vals in values
]
@@ -1830,6 +1830,7 @@ names = [
@pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names)
def test_bench_matmul(batch, seq, model, hidden):
iters = 128
formatB = F.get_special_format_str()
A = torch.randn(batch, seq, model, device="cuda").half()
@@ -1848,28 +1849,33 @@ def test_bench_matmul(batch, seq, model, hidden):
linearMixedBit.eval()
# warmup
for i in range(100):
for i in range(iters):
torch.matmul(A, B.t())
torch.cuda.synchronize()
print("")
torch.cuda.synchronize()
t0 = time.time()
for i in range(100):
for i in range(iters):
torch.matmul(A, B.t())
torch.cuda.synchronize()
print(
f"pytorch: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
)
torch.cuda.synchronize()
t0 = time.time()
for i in range(100):
for i in range(iters):
bnb.matmul(A, B)
torch.cuda.synchronize()
print(
f"bnb lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
)
print(f"CB -> CxB conversion (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
torch.cuda.synchronize()
t0 = time.time()
for i in range(iters):
bnb.matmul(A, B, threshold=6.0)
torch.cuda.synchronize()
print(f"CB -> CxB conversion + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A, threshold=0.0)
C32A, SA = F.transform(CA, "col32")
@@ -1877,18 +1883,16 @@ def test_bench_matmul(batch, seq, model, hidden):
CxB, SB = F.transform(CB, to_order=formatB)
torch.cuda.synchronize()
t0 = time.time()
for i in range(100):
for i in range(iters):
out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB)
torch.cuda.synchronize()
print(
f"igemmlt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
)
print(f"no overhead matmul-lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
BA, statsB = F.vectorwise_quant(B, dim=1)
CxB, SB = F.nvidia_transform(CB, to_order=formatB)
torch.cuda.synchronize()
t0 = time.time()
for i in range(100):
for i in range(iters):
A2 = A.view(-1, A.shape[-1]).contiguous()
CA, statsA = F.vectorwise_quant(A2, dim=1)
C32A, SA = F.nvidia_transform(CA, "col32")
@@ -1896,15 +1900,13 @@ def test_bench_matmul(batch, seq, model, hidden):
Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32)
F.vectorwise_mm_dequant(Cout, statsA, statsB.t())
torch.cuda.synchronize()
print(
f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
)
#print(f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
BA, statsB = F.vectorwise_quant(B, dim=1, quant_type="linear")
CxB, SB = F.nvidia_transform(CB, to_order=formatB)
torch.cuda.synchronize()
t0 = time.time()
for i in range(100):
for i in range(iters):
A2 = A.view(-1, A.shape[-1]).contiguous()
CA, statsA = F.vectorwise_quant(A2, dim=1, quant_type="linear")
C32A, SA = F.nvidia_transform(CA, "col32")
@@ -1912,14 +1914,12 @@ def test_bench_matmul(batch, seq, model, hidden):
Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32)
out = Cout * statsB * statsA * (1.0 / (127 * 127))
torch.cuda.synchronize()
print(
f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
)
#print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
linear8bit(A)
torch.cuda.synchronize()
t0 = time.time()
for i in range(100):
for i in range(iters):
linear8bit(A)
torch.cuda.synchronize()
print(
@@ -1929,7 +1929,7 @@ def test_bench_matmul(batch, seq, model, hidden):
linearMixedBit(A)
torch.cuda.synchronize()
t0 = time.time()
for i in range(100):
for i in range(iters):
linearMixedBit(A)
torch.cuda.synchronize()
print(
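Every variant in test_bench_matmul is timed the same way: warm up, torch.cuda.synchronize(), record time.time(), run iters iterations, synchronize again, then print the elapsed seconds. A minimal sketch of that harness, assuming a CUDA device is present and using a generic matmul rather than the library's kernels:

import time
import torch

def bench(fn, iters=128, warmup=10):
    # Warm up, then time iters calls with device synchronization on both ends.
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(iters):
        fn()
    torch.cuda.synchronize()
    return time.time() - t0

A = torch.randn(1, 2048, 768, device="cuda").half()
B = torch.randn(4 * 768, 768, device="cuda").half()
print(f"pytorch fp16: {bench(lambda: torch.matmul(A, B.t())):.4f}s")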