From 0f5c3948709ae70cf733cefbd831aaea8a4e38c9 Mon Sep 17 00:00:00 2001
From: Tim Dettmers
Date: Wed, 1 Feb 2023 20:27:01 -0800
Subject: [PATCH] Added version 0.37.0.

---
 CHANGELOG.md                    | 12 ++++++++++++
 bitsandbytes/cuda_setup/main.py | 13 +++++++------
 setup.py                        |  2 +-
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 77703a0..ac239de 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -189,3 +189,15 @@ Improvements:
  - StableEmbedding layer now has device and dtype parameters to make it 1:1 replaceable with regular Embedding layers (@lostmsu)
  - runtime performance of block-wise quantization slightly improved
  - added error message for the case multiple libcudart.so are installed and bitsandbytes picks the wrong one
+
+
+### 0.37.0
+
+#### Int8 Matmul + backward support for all GPUs
+
+Features:
+ - Int8 MatmulLt now supports backward through inversion of the ColTuring/ColAmpere format. Slow, but memory efficient. Big thanks to @borzunov
+ - Int8 now supported on all GPUs. On devices with compute capability < 7.5, the Int weights are cast to 16/32-bit for the matrix multiplication. Contributed by @borzunov
+
+Improvements:
+ - Improved logging for the CUDA detection mechanism.
diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index ce44d97..cd9573f 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -80,9 +80,10 @@ class CUDASetup:
         self.add_log_entry('python setup.py install')
 
     def initialize(self):
-        self.has_printed = False
-        self.lib = None
-        self.initialized = False
+        if not getattr(self, 'initialized', False):
+            self.has_printed = False
+            self.lib = None
+            self.initialized = False
 
     def run_cuda_setup(self):
         self.initialized = True
@@ -103,7 +104,7 @@ class CUDASetup:
                 legacy_binary_name = "libbitsandbytes_cpu.so"
                 self.add_log_entry(f"CUDA SETUP: Defaulting to {legacy_binary_name}...")
                 binary_path = package_dir / legacy_binary_name
-                if not binary_path.exists():
+                if not binary_path.exists() or torch.cuda.is_available():
                     self.add_log_entry('')
                     self.add_log_entry('='*48 + 'ERROR' + '='*37)
                     self.add_log_entry('CUDA SETUP: CUDA detection failed! Possible reasons:')
@@ -112,6 +113,7 @@ class CUDASetup:
                     self.add_log_entry('3. You have multiple conflicting CUDA libraries')
                     self.add_log_entry('4. Required library not pre-compiled for this bitsandbytes release!')
                     self.add_log_entry('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.')
+                    self.add_log_entry('CUDA SETUP: The CUDA version for the compile might depend on your conda install. Inspect CUDA version via `conda list | grep cuda`.')
                     self.add_log_entry('='*80)
                     self.add_log_entry('')
                     self.generate_instructions()
@@ -148,7 +150,7 @@ def is_cublasLt_compatible(cc):
     if cc is not None:
         cc_major, cc_minor = cc.split('.')
         if int(cc_major) < 7 or (int(cc_major) == 7 and int(cc_minor) < 5):
-            cuda_setup.add_log_entry("WARNING: Compute capability < 7.5 detected! Proceeding to load CPU-only library...", is_warning=True)
+            cuda_setup.add_log_entry("WARNING: Compute capability < 7.5 detected! Only slow 8-bit matmul is supported for your GPU!", is_warning=True)
         else:
             has_cublaslt = True
     return has_cublaslt
@@ -362,7 +364,6 @@ def evaluate_cuda_setup():
     print('')
     print('='*35 + 'BUG REPORT' + '='*35)
     print('Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues')
-    print('For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link')
     print('='*80)
     if not torch.cuda.is_available(): return 'libsbitsandbytes_cpu.so', None, None, None, None
 
diff --git a/setup.py b/setup.py
index 93df40e..e3f453e 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@ def read(fname):
 
 setup(
     name=f"bitsandbytes",
-    version=f"0.36.0-2",
+    version=f"0.37.0",
     author="Tim Dettmers",
     author_email="dettmers@cs.washington.edu",
     description="8-bit optimizers and matrix multiplication routines.",
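
Note (not part of the patch): a minimal usage sketch of the Int8 matmul + backward
feature described in the 0.37.0 CHANGELOG entry above. It assumes the existing
bnb.nn.Linear8bitLt module and a CUDA GPU; on devices with compute capability < 7.5
the int8 weights are cast to 16/32-bit for the matmul, so the same code runs, only slower.

    # Hypothetical example, not taken from the bitsandbytes repository.
    import torch
    import bitsandbytes as bnb

    # 8-bit linear layer; the floating-point weights are quantized to int8
    # when the module is moved to the GPU (has_fp16_weights=False).
    layer = bnb.nn.Linear8bitLt(
        1024, 1024, bias=False, has_fp16_weights=False, threshold=6.0
    ).cuda()

    x = torch.randn(8, 1024, dtype=torch.float16, device='cuda', requires_grad=True)
    out = layer(x)          # forward pass through the Int8 matmul
    out.sum().backward()    # input gradients now work on all GPUs (slow, but memory efficient)
    print(x.grad.shape)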