From 0b481bfcc243cd8cc0006382f43dd24b359f6655 Mon Sep 17 00:00:00 2001 From: arlo-phoenix Date: Tue, 8 Aug 2023 18:50:26 +0000 Subject: [PATCH] Use workaround for ROCm wave32 recognition just sets __AMDGCN_WAVEFRONT_SIZE forcefully to 32. Not correct (some GPU's don't support wave32), but works on the supported GPU's. Can disable with DISABLE_WARP_32 With this blockwise quantize works and with that nf4 is supported. --- Makefile | 23 +++++++++++++++++------ bitsandbytes/functional.py | 1 + csrc/kernels.cu | 14 ++++++++------ csrc/ops.cu | 2 +- csrc/ops.cuh | 4 ++++ 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 0aa8122..17ce761 100644 --- a/Makefile +++ b/Makefile @@ -10,9 +10,23 @@ ifeq ($(ROCM_HOME),) ROCM_HOME:= $(shell which hipcc | rev | cut -d'/' -f4- | rev) endif +ifneq ($(CUDA_HOME),) +ifndef CUDA_VERSION +$(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU) +CUDA_VERSION:= +endif + +else ifneq ($(ROCM_HOME),) +ifndef ROCM_TARGET +$(error ERROR: ROCM_TARGET not set. Call make with ROCM string (see https://www.llvm.org/docs/AMDGPUUsage.html#processors), for example: make hip ROCM_TARGET=gfx1030) +ROCM_TARGET:= +endif +endif + NVCC := $(CUDA_HOME)/bin/nvcc +HIPCC:= $(ROCM_HOME)/bin/hipcc ########################################### @@ -112,14 +126,11 @@ cpuonly: $(BUILD_DIR) env $(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so -HIP_INCLUDE := -I $(ROCM_HOME)/include -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include +HIP_INCLUDE := -I $(ROCM_HOME)/include -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include HIP_LIB := -L $(ROCM_HOME)/lib -lhipblas -lhiprand -lhipsparse #-lhipblaslt #TODO: check if this is actually only gfx90a - hip: $(BUILD_DIR) - # Add --offload-arch=gfx1030 if this fails - /usr/bin/hipcc -std=c++14 -c -fPIC $(HIP_INCLUDE) -o $(BUILD_DIR)/ops.o -D NO_CUBLASLT -D BITS_AND_BYTES_USE_ROCM $(CSRC)/ops.cu - /usr/bin/hipcc -std=c++14 -c -fPIC $(HIP_INCLUDE) -o $(BUILD_DIR)/kernels.o -D NO_CUBLASLT -D BITS_AND_BYTES_USE_ROCM $(CSRC)/kernels.cu - # /usr/bin/hipcc -fPIC -static $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.so + $(HIPCC) -std=c++14 -c -fPIC --offload-arch=$(ROCM_TARGET) $(HIP_INCLUDE) -o $(BUILD_DIR)/ops.o -DNO_CUBLASLT -DBITS_AND_BYTES_USE_ROCM $(CSRC)/ops.cu + $(HIPCC) -std=c++14 -c -fPIC --offload-arch=$(ROCM_TARGET) $(HIP_INCLUDE) -o $(BUILD_DIR)/kernels.o -DNO_CUBLASLT -DBITS_AND_BYTES_USE_ROCM $(CSRC)/kernels.cu # HCC is deprecated, but used by hipBLASlt header. Since blas isn't even used doesn't matter, this is just so that it even compiles $(GPP) -std=c++14 -D__HIP_PLATFORM_HCC__ -D__HIP_PLATFORM_AMD__ -DBUILD_CUDA -DBITS_AND_BYTES_USE_ROCM -shared -fPIC $(HIP_INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(FILES_CPP) $(HIP_LIB) -o ./bitsandbytes/libbitsandbytes_hip_nohipblaslt.so diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 96f8ce4..9b0938f 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -815,6 +815,7 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz if out is None: out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device) + #TODO: catch rocm wave64 only, pytorch has a property, but that one likely contains the wrong waveSize assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] prev_device = pre_call(A.device) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 1cc7374..9878f2b 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -736,23 +736,26 @@ __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned c } } -template +template //__launch_bounds__(TH, 4) __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n) { - #ifdef BITS_AND_BYTES_USE_ROCM - printf("kQuantizeBlockwise is not supported on Rocm!"); + #ifdef DISABLE_WARP_32 + //CUDA has warpsize 32 so just multiply by 2 to get amd warp size + //currently just stopping below with a return anyways so this size isn't actually used, just needed for compilation + const int BLOCK_SIZE=((REQUESTED_BLOCK_SIZE / NUM_PER_TH % 64) != 0 ) ? REQUESTED_BLOCK_SIZE * 2 : REQUESTED_BLOCK_SIZE; //TODO: figure out how to make compiler recognize what isn't executed based on template arguments, without the code below in ifndef would trigger static_assert if //this condition is true - if ((BLOCK_SIZE / NUM_PER_TH % 64) != 0) + if ((REQUESTED_BLOCK_SIZE / NUM_PER_TH % 64) != 0) { printf("kQuantizeBlockwise not fully supported on Rocm! BLOCK_SIZE/NUM_PER_TH needs to be divisible by 64."); return; } + #else + const int BLOCK_SIZE=REQUESTED_BLOCK_SIZE; #endif - #ifndef BITS_AND_BYTES_USE_ROCM const int n_full = gridDim.x * BLOCK_SIZE; int valid_items = 0; const int base_idx = (blockIdx.x * BLOCK_SIZE); @@ -854,7 +857,6 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float __syncthreads(); StoreChar(storec).Store(&(out[(DATA_TYPE > 0) ? i/2 : i]), qvals, (DATA_TYPE > 0) ? (valid_items+1)/2 : valid_items); } - #endif } template diff --git a/csrc/ops.cu b/csrc/ops.cu index 0606fd3..252cc09 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -11,7 +11,7 @@ #include #ifdef BITS_AND_BYTES_USE_ROCM -//#include +#include #else #include #endif diff --git a/csrc/ops.cuh b/csrc/ops.cuh index a55ec24..55425ee 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -14,6 +14,10 @@ #ifdef BITS_AND_BYTES_USE_ROCM +#ifndef DISABLE_WARP_32 + #define __AMDGCN_WAVEFRONT_SIZE 32 // check rocminfo | grep "Wavefront Size". Should be supported on all new GPU's +#endif + #include #include #include