diff --git a/CHANGELOG.md b/CHANGELOG.md index 387af03..e35e04d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -255,3 +255,6 @@ Features: Bug fixes: - Added `device` variable for bitsandbytes layers to be compatible with PyTorch layers. + +Deprecated: + - Binaries for CUDA 11.2, 11.3, 11.6 no longer ship with `pip install bitsandbytes` and need to be compiled from source. diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 4b05672..01caf89 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3312,6 +3312,7 @@ __device__ static float nf4_data[16] = {-1.0, -0.6961928009986877, -0.5250730514 template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize) { +#if __CUDA_ARCH__ >= 750 using namespace nvcuda; int col_offset = blockIdx.x *32; const int warp_id = threadIdx.x / 32; @@ -3517,6 +3518,7 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i if(col_offset + warp_lane < M) out[col_offset + warp_lane] = smem_C[warp_lane]; +#endif } #define num_values_4bit 32 @@ -3544,7 +3546,7 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc T local_absmax = T(0.0f); for(int i = threadIdx.x; i < 16; i++) - quant_map[i] = datatype[i]; + quant_map[i] = T(datatype[i]); __syncthreads(); @@ -3577,8 +3579,14 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc #pragma unroll for(int k = 0; k < num_values_4bit; k++) { - local_B[k*2] = quant_map[local_B_4bit[k] >> 4]*local_absmax; - local_B[k*2 + 1] = quant_map[local_B_4bit[k] & 0x0F]*local_absmax; + #if __CUDA_ARCH__ >= 800 + local_B[k*2] = quant_map[local_B_4bit[k] >> 4]*local_absmax; + local_B[k*2 + 1] = quant_map[local_B_4bit[k] & 0x0F]*local_absmax; + #else + // bf16 multiplication not supported + local_B[k*2] = T((float)quant_map[local_B_4bit[k] >> 4]*(float)local_absmax); + local_B[k*2 + 1] =
T((float)quant_map[local_B_4bit[k] & 0x0F]*(float)local_absmax); + #endif } if(inner_idx+num_values_4bit) @@ -3609,7 +3617,14 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc #pragma unroll for(int k = 0; k < num_values_4bit; k++) - local_C += (float)(local_A[k]*local_B[k]); + { + #if __CUDA_ARCH__ >= 800 + local_C += (float)(local_A[k]*local_B[k]); + #else + // bf16 multiplication not supported + local_C += ((float)local_A[k]*(float)local_B[k]); + #endif + } } diff --git a/deploy.sh b/deploy.sh index a2257a2..bb11fd6 100644 --- a/deploy.sh +++ b/deploy.sh @@ -10,103 +10,73 @@ if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then fi -module unload cuda && echo "no module function available. Probably not on a slurm cluster." -module unload gcc && echo "no module function available. Probably not on a slurm cluster." - -rm -rf dist build -make cleaneggs -make cleanlibs - -make clean -export CUDA_HOME= -export CUDA_VERSION= -make cpuonly CUDA_VERSION="CPU" - -if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi - -make clean -export CUDA_HOME=$BASE_PATH/cuda-11.0 -make cuda110 CUDA_VERSION=110 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi - -make clean -export CUDA_HOME=$BASE_PATH/cuda-11.1 -make cuda11x CUDA_VERSION=111 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi - -make clean -export CUDA_HOME=$BASE_PATH/cuda-11.2 -make cuda11x CUDA_VERSION=112 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda112.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!"
1>&2 - exit 64 -fi - -make clean -export CUDA_HOME=$BASE_PATH/cuda-11.3 -make cuda11x CUDA_VERSION=113 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda113.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi - -make clean -export CUDA_HOME=$BASE_PATH/cuda-11.4 -make cuda11x CUDA_VERSION=114 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi - -make clean -export CUDA_HOME=$BASE_PATH/cuda-11.5 -make cuda11x CUDA_VERSION=115 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi - -make clean -export CUDA_HOME=$BASE_PATH/cuda-11.6 - -make cuda11x CUDA_VERSION=116 -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda116.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi - -make clean -export CUDA_HOME=$BASE_PATH/cuda-11.7 -make cuda11x CUDA_VERSION=117 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi +#module unload cuda && echo "no module function available. Probably not on a slurm cluster." +#module unload gcc && echo "no module function available. Probably not on a slurm cluster." +# +#rm -rf dist build +#make cleaneggs +#make cleanlibs +# +#make clean +#export CUDA_HOME= +#export CUDA_VERSION= +#make cpuonly CUDA_VERSION="CPU" +# +#if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then +# # Control will enter here if $DIRECTORY doesn't exist. +# echo "Compilation unsuccessul!" 1>&2 +# exit 64 +#fi +# +#make clean +#export CUDA_HOME=$BASE_PATH/cuda-11.0 +#make cuda110 CUDA_VERSION=110 +# +#if [ ! 
-f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then +# # Control will enter here if $DIRECTORY doesn't exist. +# echo "Compilation unsuccessul!" 1>&2 +# exit 64 +#fi +# +#make clean +#export CUDA_HOME=$BASE_PATH/cuda-11.1 +#make cuda11x CUDA_VERSION=111 +# +#if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then +# # Control will enter here if $DIRECTORY doesn't exist. +# echo "Compilation unsuccessul!" 1>&2 +# exit 64 +#fi +# +#make clean +#export CUDA_HOME=$BASE_PATH/cuda-11.4 +#make cuda11x CUDA_VERSION=114 +# +#if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then +# # Control will enter here if $DIRECTORY doesn't exist. +# echo "Compilation unsuccessul!" 1>&2 +# exit 64 +#fi +# +#make clean +#export CUDA_HOME=$BASE_PATH/cuda-11.5 +#make cuda11x CUDA_VERSION=115 +# +#if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then +# # Control will enter here if $DIRECTORY doesn't exist. +# echo "Compilation unsuccessul!" 1>&2 +# exit 64 +#fi +# +#make clean +#export CUDA_HOME=$BASE_PATH/cuda-11.7 +#make cuda11x CUDA_VERSION=117 +# +#if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then +# # Control will enter here if $DIRECTORY doesn't exist. +# echo "Compilation unsuccessul!" 1>&2 +# exit 64 +#fi make clean export CUDA_HOME=$BASE_PATH/cuda-11.8 @@ -138,6 +108,16 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then exit 64 fi +make clean +export CUDA_HOME=$BASE_PATH/cuda-12.2 +make cuda12x CUDA_VERSION=122 + +if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122.so" ]; then + # Control will enter here if $DIRECTORY doesn't exist. + echo "Compilation unsuccessul!" 1>&2 + exit 64 +fi + make clean export CUDA_HOME=$BASE_PATH/cuda-11.0 @@ -160,26 +140,6 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111_nocublaslt.so" ]; then exit 64 fi -make clean -export CUDA_HOME=$BASE_PATH/cuda-11.2 -make cuda11x_nomatmul CUDA_VERSION=112 - -if [ ! 
-f "./bitsandbytes/libbitsandbytes_cuda112_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi - -make clean -export CUDA_HOME=$BASE_PATH/cuda-11.3 -make cuda11x_nomatmul CUDA_VERSION=113 - -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda113_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi - make clean export CUDA_HOME=$BASE_PATH/cuda-11.4 make cuda11x_nomatmul CUDA_VERSION=114 @@ -200,16 +160,6 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115_nocublaslt.so" ]; then exit 64 fi -make clean -export CUDA_HOME=$BASE_PATH/cuda-11.6 - -make cuda11x_nomatmul CUDA_VERSION=116 -if [ ! -f "./bitsandbytes/libbitsandbytes_cuda116_nocublaslt.so" ]; then - # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 - exit 64 -fi - make clean export CUDA_HOME=$BASE_PATH/cuda-11.7 make cuda11x_nomatmul CUDA_VERSION=117 @@ -250,5 +200,15 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so" ]; then exit 64 fi +make clean +export CUDA_HOME=$BASE_PATH/cuda-12.2 +make cuda12x_nomatmul CUDA_VERSION=122 + +if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so" ]; then + # Control will enter here if $DIRECTORY doesn't exist. + echo "Compilation unsuccessul!" 1>&2 + exit 64 +fi + python -m build python -m twine upload dist/* --verbose