Added ARCH guard for bfloat16 computations.
This commit is contained in:
parent
0a1cced375
commit
1c774ecebb
|
@ -255,3 +255,6 @@ Features:
|
||||||
|
|
||||||
Bug fixes:
|
Bug fixes:
|
||||||
- Added `device` variable for bitsandbytes layers to be compatible with PyTorch layers.
|
- Added `device` variable for bitsandbytes layers to be compatible with PyTorch layers.
|
||||||
|
|
||||||
|
Deprecated:
|
||||||
|
- Binaries for CUDA 11.2, 11.3, 11.6 no longer ship with `pip install bitsandbytes` and need to be compiled from source.
|
||||||
|
|
|
@ -3312,6 +3312,7 @@ __device__ static float nf4_data[16] = {-1.0, -0.6961928009986877, -0.5250730514
|
||||||
template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize)
|
template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
#if __CUDA_ARCH__ >= 750
|
||||||
using namespace nvcuda;
|
using namespace nvcuda;
|
||||||
int col_offset = blockIdx.x *32;
|
int col_offset = blockIdx.x *32;
|
||||||
const int warp_id = threadIdx.x / 32;
|
const int warp_id = threadIdx.x / 32;
|
||||||
|
@ -3517,6 +3518,7 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
|
||||||
|
|
||||||
if(col_offset + warp_lane < M)
|
if(col_offset + warp_lane < M)
|
||||||
out[col_offset + warp_lane] = smem_C[warp_lane];
|
out[col_offset + warp_lane] = smem_C[warp_lane];
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#define num_values_4bit 32
|
#define num_values_4bit 32
|
||||||
|
@ -3544,7 +3546,7 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc
|
||||||
T local_absmax = T(0.0f);
|
T local_absmax = T(0.0f);
|
||||||
|
|
||||||
for(int i = threadIdx.x; i < 16; i++)
|
for(int i = threadIdx.x; i < 16; i++)
|
||||||
quant_map[i] = datatype[i];
|
quant_map[i] = T(datatype[i]);
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
|
@ -3577,8 +3579,14 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for(int k = 0; k < num_values_4bit; k++)
|
for(int k = 0; k < num_values_4bit; k++)
|
||||||
{
|
{
|
||||||
|
#if __CUDA_ARCH__ >= 800
|
||||||
local_B[k*2] = quant_map[local_B_4bit[k] >> 4]*local_absmax;
|
local_B[k*2] = quant_map[local_B_4bit[k] >> 4]*local_absmax;
|
||||||
local_B[k*2 + 1] = quant_map[local_B_4bit[k] & 0x0F]*local_absmax;
|
local_B[k*2 + 1] = quant_map[local_B_4bit[k] & 0x0F]*local_absmax;
|
||||||
|
#else
|
||||||
|
// bf16 multipliation not supported
|
||||||
|
local_B[k*2] = T((float)quant_map[local_B_4bit[k] >> 4]*(float)local_absmax);
|
||||||
|
local_B[k*2 + 1] = T((float)quant_map[local_B_4bit[k] & 0x0F]*(float)local_absmax);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
if(inner_idx+num_values_4bit)
|
if(inner_idx+num_values_4bit)
|
||||||
|
@ -3609,7 +3617,14 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for(int k = 0; k < num_values_4bit; k++)
|
for(int k = 0; k < num_values_4bit; k++)
|
||||||
|
{
|
||||||
|
#if __CUDA_ARCH__ >= 800
|
||||||
local_C += (float)(local_A[k]*local_B[k]);
|
local_C += (float)(local_A[k]*local_B[k]);
|
||||||
|
#else
|
||||||
|
// bf16 multipliation not supported
|
||||||
|
local_C += ((float)local_A[k]*(float)local_B[k]);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
214
deploy.sh
214
deploy.sh
|
@ -10,103 +10,73 @@ if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
module unload cuda && echo "no module function available. Probably not on a slurm cluster."
|
#module unload cuda && echo "no module function available. Probably not on a slurm cluster."
|
||||||
module unload gcc && echo "no module function available. Probably not on a slurm cluster."
|
#module unload gcc && echo "no module function available. Probably not on a slurm cluster."
|
||||||
|
#
|
||||||
rm -rf dist build
|
#rm -rf dist build
|
||||||
make cleaneggs
|
#make cleaneggs
|
||||||
make cleanlibs
|
#make cleanlibs
|
||||||
|
#
|
||||||
make clean
|
#make clean
|
||||||
export CUDA_HOME=
|
#export CUDA_HOME=
|
||||||
export CUDA_VERSION=
|
#export CUDA_VERSION=
|
||||||
make cpuonly CUDA_VERSION="CPU"
|
#make cpuonly CUDA_VERSION="CPU"
|
||||||
|
#
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then
|
#if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
# # Control will enter here if $DIRECTORY doesn't exist.
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
# echo "Compilation unsuccessul!" 1>&2
|
||||||
exit 64
|
# exit 64
|
||||||
fi
|
#fi
|
||||||
|
#
|
||||||
make clean
|
#make clean
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.0
|
#export CUDA_HOME=$BASE_PATH/cuda-11.0
|
||||||
make cuda110 CUDA_VERSION=110
|
#make cuda110 CUDA_VERSION=110
|
||||||
|
#
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then
|
#if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
# # Control will enter here if $DIRECTORY doesn't exist.
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
# echo "Compilation unsuccessul!" 1>&2
|
||||||
exit 64
|
# exit 64
|
||||||
fi
|
#fi
|
||||||
|
#
|
||||||
make clean
|
#make clean
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.1
|
#export CUDA_HOME=$BASE_PATH/cuda-11.1
|
||||||
make cuda11x CUDA_VERSION=111
|
#make cuda11x CUDA_VERSION=111
|
||||||
|
#
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then
|
#if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
# # Control will enter here if $DIRECTORY doesn't exist.
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
# echo "Compilation unsuccessul!" 1>&2
|
||||||
exit 64
|
# exit 64
|
||||||
fi
|
#fi
|
||||||
|
#
|
||||||
make clean
|
#make clean
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.2
|
#export CUDA_HOME=$BASE_PATH/cuda-11.4
|
||||||
make cuda11x CUDA_VERSION=112
|
#make cuda11x CUDA_VERSION=114
|
||||||
|
#
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda112.so" ]; then
|
#if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
# # Control will enter here if $DIRECTORY doesn't exist.
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
# echo "Compilation unsuccessul!" 1>&2
|
||||||
exit 64
|
# exit 64
|
||||||
fi
|
#fi
|
||||||
|
#
|
||||||
make clean
|
#make clean
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.3
|
#export CUDA_HOME=$BASE_PATH/cuda-11.5
|
||||||
make cuda11x CUDA_VERSION=113
|
#make cuda11x CUDA_VERSION=115
|
||||||
|
#
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda113.so" ]; then
|
#if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
# # Control will enter here if $DIRECTORY doesn't exist.
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
# echo "Compilation unsuccessul!" 1>&2
|
||||||
exit 64
|
# exit 64
|
||||||
fi
|
#fi
|
||||||
|
#
|
||||||
make clean
|
#make clean
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.4
|
#export CUDA_HOME=$BASE_PATH/cuda-11.7
|
||||||
make cuda11x CUDA_VERSION=114
|
#make cuda11x CUDA_VERSION=117
|
||||||
|
#
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then
|
#if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
# # Control will enter here if $DIRECTORY doesn't exist.
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
# echo "Compilation unsuccessul!" 1>&2
|
||||||
exit 64
|
# exit 64
|
||||||
fi
|
#fi
|
||||||
|
|
||||||
make clean
|
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.5
|
|
||||||
make cuda11x CUDA_VERSION=115
|
|
||||||
|
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then
|
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
|
||||||
exit 64
|
|
||||||
fi
|
|
||||||
|
|
||||||
make clean
|
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.6
|
|
||||||
|
|
||||||
make cuda11x CUDA_VERSION=116
|
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda116.so" ]; then
|
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
|
||||||
exit 64
|
|
||||||
fi
|
|
||||||
|
|
||||||
make clean
|
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.7
|
|
||||||
make cuda11x CUDA_VERSION=117
|
|
||||||
|
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then
|
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
|
||||||
exit 64
|
|
||||||
fi
|
|
||||||
|
|
||||||
make clean
|
make clean
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.8
|
export CUDA_HOME=$BASE_PATH/cuda-11.8
|
||||||
|
@ -138,6 +108,16 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then
|
||||||
exit 64
|
exit 64
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
make clean
|
||||||
|
export CUDA_HOME=$BASE_PATH/cuda-12.2
|
||||||
|
make cuda12x CUDA_VERSION=122
|
||||||
|
|
||||||
|
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122.so" ]; then
|
||||||
|
# Control will enter here if $DIRECTORY doesn't exist.
|
||||||
|
echo "Compilation unsuccessul!" 1>&2
|
||||||
|
exit 64
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
make clean
|
make clean
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.0
|
export CUDA_HOME=$BASE_PATH/cuda-11.0
|
||||||
|
@ -160,26 +140,6 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111_nocublaslt.so" ]; then
|
||||||
exit 64
|
exit 64
|
||||||
fi
|
fi
|
||||||
|
|
||||||
make clean
|
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.2
|
|
||||||
make cuda11x_nomatmul CUDA_VERSION=112
|
|
||||||
|
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda112_nocublaslt.so" ]; then
|
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
|
||||||
exit 64
|
|
||||||
fi
|
|
||||||
|
|
||||||
make clean
|
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.3
|
|
||||||
make cuda11x_nomatmul CUDA_VERSION=113
|
|
||||||
|
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda113_nocublaslt.so" ]; then
|
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
|
||||||
exit 64
|
|
||||||
fi
|
|
||||||
|
|
||||||
make clean
|
make clean
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.4
|
export CUDA_HOME=$BASE_PATH/cuda-11.4
|
||||||
make cuda11x_nomatmul CUDA_VERSION=114
|
make cuda11x_nomatmul CUDA_VERSION=114
|
||||||
|
@ -200,16 +160,6 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115_nocublaslt.so" ]; then
|
||||||
exit 64
|
exit 64
|
||||||
fi
|
fi
|
||||||
|
|
||||||
make clean
|
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.6
|
|
||||||
|
|
||||||
make cuda11x_nomatmul CUDA_VERSION=116
|
|
||||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda116_nocublaslt.so" ]; then
|
|
||||||
# Control will enter here if $DIRECTORY doesn't exist.
|
|
||||||
echo "Compilation unsuccessul!" 1>&2
|
|
||||||
exit 64
|
|
||||||
fi
|
|
||||||
|
|
||||||
make clean
|
make clean
|
||||||
export CUDA_HOME=$BASE_PATH/cuda-11.7
|
export CUDA_HOME=$BASE_PATH/cuda-11.7
|
||||||
make cuda11x_nomatmul CUDA_VERSION=117
|
make cuda11x_nomatmul CUDA_VERSION=117
|
||||||
|
@ -250,5 +200,15 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so" ]; then
|
||||||
exit 64
|
exit 64
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
make clean
|
||||||
|
export CUDA_HOME=$BASE_PATH/cuda-12.2
|
||||||
|
make cuda12x_nomatmul CUDA_VERSION=122
|
||||||
|
|
||||||
|
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so" ]; then
|
||||||
|
# Control will enter here if $DIRECTORY doesn't exist.
|
||||||
|
echo "Compilation unsuccessul!" 1>&2
|
||||||
|
exit 64
|
||||||
|
fi
|
||||||
|
|
||||||
python -m build
|
python -m build
|
||||||
python -m twine upload dist/* --verbose
|
python -m twine upload dist/* --verbose
|
||||||
|
|
Loading…
Reference in New Issue
Block a user