diff --git a/Makefile b/Makefile index 059545c..ea6ee87 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) -#GPP:= /usr/bin/g++ -GPP:= /sw/gcc/11.2.0/bin/g++ +GPP:= /usr/bin/g++ +#GPP:= /sw/gcc/11.2.0/bin/g++ ifeq ($(CUDA_HOME),) CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev) endif @@ -26,7 +26,6 @@ FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include INCLUDE_10x := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/dependencies/cub -I $(ROOT_DIR)/include -INCLUDE_cutlass := -I $(ROOT_DIR)/dependencies/cutlass/include -I $(ROOT_DIR)/dependencies/cutlass/tools/util/include/ -I $(ROOT_DIR)/dependencies/cutlass/include/cute/util/ LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcurand -lcusparse -L $(CONDA_PREFIX)/lib # NVIDIA NVCC compilation flags @@ -63,8 +62,8 @@ CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 all: $(BUILD_DIR) env - $(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) + $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env @@ -102,11 +101,6 @@ cuda11x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) -cuda11x_cutlass: $(BUILD_DIR) env cutlass - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math --expt-relaxed-constexpr -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(INCLUDE_cutlass) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++17 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(INCLUDE_cutlass) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - cuda12x: $(BUILD_DIR) env $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 7e4874a..54a08a1 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -1456,7 +1456,7 @@ def cutlass3_gemm( # [km, nk -> mn] lda = ldb = ldc = 1 #lda = 1 - print(m, n, k, lda, ldb, ldc) + #print(m, n, k, lda, ldb, ldc) is_on_gpu([B, A, out]) m = ct.c_int32(m) n = ct.c_int32(n) @@ -1466,7 +1466,7 @@ def cutlass3_gemm( ldc = ct.c_int32(ldc) alpha = ct.c_float(1.0) beta = ct.c_float(0.0) - lib.ccutlass_gemm(m, n, k, alpha, get_ptr(B), lda, get_ptr(A), ldb, beta, get_ptr(out), ldc) + lib.ccutlass_gemm(m, n, k, alpha, get_ptr(A), ldb, get_ptr(B), lda, beta, get_ptr(out), ldc) return out diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 4c83573..ed87c69 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -15,11 +15,6 @@ #include #include -#include -#include "cutlass/util/print_error.hpp" -#include "cutlass/util/GPU_Clock.hpp" -#include "cutlass/util/cublas_wrappers.hpp" - #define HLF_MAX 65504 #define TH 1024 #define NUM 4 @@ -2949,147 +2944,42 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// 9. write outputs to matmul output matrix //} -#include "cutlass/util/print_error.hpp" -#include "cutlass/util/GPU_Clock.hpp" -#if defined(CUTLASS_ENABLE_CUBLAS) && CUTLASS_ENABLE_CUBLAS != 0 -# include "cutlass/util/cublas_wrappers.hpp" -#endif -//#include "cutlass/util/helper_cuda.hpp" - __global__ void gemm_device(int M, int N, int K, float const* A, float const* B, float * out, int lda, int ldb, int ldc, float alpha, float beta) { - using namespace cute; - using X = Underscore; +// 0. We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp +// 1. Load dataB into register +// 2. Dequantize B +// 3. Fetch data from A and multiply - // Preconditions - //CUTE_STATIC_ASSERT(is_static::value); - //CUTE_STATIC_ASSERT(is_static::value); - //CUTE_STATIC_ASSERT(is_static::value); + typedef cub::BlockLoad LoadA; + __shared__ typename LoadA::TempStorage loada; + float dataA[1]; + int valid_items = 0; - //CUTE_STATIC_ASSERT(is_static::value); - //CUTE_STATIC_ASSERT(is_static::value); - //CUTE_STATIC_ASSERT(is_static::value); - - //CUTE_STATIC_ASSERT_V(size(tA) == size(tC)); - //CUTE_STATIC_ASSERT_V(size(tB) == size(tC)); - - // Define block sizes (static) - auto bM = Int<128>{}; - auto bN = Int<128>{}; - auto bK = Int< 8>{}; - - // Define the block layouts (static) - auto bA = make_layout(make_shape(bM,bK)); - auto bB = make_layout(make_shape(bN,bK)); - auto bC = make_layout(make_shape(bM,bN)); - - // Define the thread layouts (static) - auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{})); - auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{})); - auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); - - //CUTE_STATIC_ASSERT_V(shape<0>(blockA) == shape<0>(blockC)); // BLK_M - //CUTE_STATIC_ASSERT_V(shape<0>(blockB) == shape<1>(blockC)); // BLK_N - //CUTE_STATIC_ASSERT_V(shape<1>(blockA) == shape<1>(blockB)); // BLK_K - - // Shared memory buffers - __shared__ float smemA[128*8]; - __shared__ float smemB[128*8]; - auto sA = make_tensor(make_smem_ptr(smemA), bA); // (BLK_M,BLK_K) - auto sB = make_tensor(make_smem_ptr(smemB), bB); // (BLK_N,BLK_K) - - auto dA = make_stride(Int<1>{}, lda); - auto dB = make_stride(Int<1>{}, ldb); - auto dC = make_stride(Int<1>{}, ldc); - - // Represent the full tensors - auto mA = make_tensor(make_gmem_ptr(A), make_shape(M,K), dA); // (M,K) - auto mB = make_tensor(make_gmem_ptr(B), make_shape(N,K), dB); // (N,K) - auto mC = make_tensor(make_gmem_ptr(out), make_shape(M,N), dC); // (M,N) - - // Get the appropriate blocks for this thread block -- - // potential for thread block locality - auto blk_shape = make_shape(size<0>(sA), size<0>(sB), size<1>(sB));// (BLK_M,BLK_N,BLK_K) - auto blk_coord = make_coord(blockIdx.x, blockIdx.y, _); // (m,n,k) - - auto gA = local_tile(mA, blk_shape, blk_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k) - auto gB = local_tile(mB, blk_shape, blk_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k) - auto gC = local_tile(mC, blk_shape, blk_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N) - - // - // Partition the copying of A and B tiles across the threads - // - - // TUTORIAL: Example of simple partitioning of A|B tiles over tA|tB - // Default is a raked partition, but can be changed with Step parameter - - auto tAgA = local_partition(gA, tA, threadIdx.x); // (THR_M,THR_K,k) - auto tAsA = local_partition(sA, tA, threadIdx.x); // (THR_M,THR_K) - - auto tBgB = local_partition(gB, tB, threadIdx.x); // (THR_N,THR_K,k) - auto tBsB = local_partition(sB, tB, threadIdx.x); // (THR_N,THR_K) - - // - // Define C accumulators and A/B partitioning - // - - // TUTORIAL: Example of partitioning via projections of tC - - // Partition sA (M,K) by the rows of tC - auto tCsA = local_partition(sA, tC, threadIdx.x, Step<_1, X>{}); // (THR_M,BLK_K) - // Partition sB (N,K) by the cols of tC - auto tCsB = local_partition(sB, tC, threadIdx.x, Step< X,_1>{}); // (THR_N,BLK_K) - // Partition gC (M,N) by the tile of tC - auto tCgC = local_partition(gC, tC, threadIdx.x, Step<_1,_1>{}); // (THR_M,THR_N) - - // Allocate the accumulators -- same size as the projected data - auto tCrC = make_fragment_like(tCgC); // (THR_M,THR_N) - - // Clear the accumulators - clear(tCrC); + __shared__ float[16*256] tileA; - // TUTORIAL: Example of a very simple compute loop - // Data is read from global to shared memory via the tA|tB partitioning - // gemm(.) operates on the shared memory directly via the tC partitioning - - auto k_max = size<2>(tAgA); - - for (int k = 0; k < k_max; ++k) - { - // Copy gmem to smem - copy(tAgA(_,_,k), tAsA); - copy(tBgB(_,_,k), tBsB); - - // In case copy uses cp.async, make sure that the cp.async - // instructions are ordered with respect to other cp.async - // instructions (fence), then wait on all the outstanding copy - // operations (wait<0>()). __syncthreads() alone does not do - // this. - // - // NOTE: cp_async_wait<0>() currently issues cp.async.wait_all. - // This is equivalent to cp.async.commit_group followed by - // cp.async_wait_group 0. This should make the first - // cp_async_fence() (which also issues cp.async.commit_group) - // redundant. The tutorial works as-is, so we'll leave the - // redundant fence in for now and study its removal later. - cp_async_fence(); - cp_async_wait<0>(); - - __syncthreads(); - - // Compute gemm on smem - gemm(tCsA, tCsB, tCrC); - - __syncthreads(); - } + for(int idxA = 0; idxA < M*K; idxA+= 256) + { + valid_items = M*K - idxA > 256 ? 256 : M*K - idxA; + int baserow = 0; + for(int row = baserow; row < baserow+16 && row < M + ; row++) + { + LoadA(loada).Load(&(A[(row*lda) + i]), dataA, valid_items, 0.0f); + tileA[row*256 + threadIdx.x] = dataA[0]; + __syncthreads(); + } + baserow += 16; + + + } + - axpby(alpha, tCrC, beta, tCgC); } diff --git a/csrc/ops.cu b/csrc/ops.cu index ca56fae..8933927 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -665,9 +665,6 @@ template void extractOutliers(char * A, int *idx, char *out, int id -#include -#include "cutlass/util/helper_cuda.hpp" - void gemm_host(int m, int n, int k, float alpha, @@ -676,29 +673,14 @@ void gemm_host(int m, int n, int k, float beta, float * C, int ldc) { - cute::device_init(0); - using namespace cute; + dim3 dimBlock(256); + int num_blocks = (n+31)/32; - - // Define shapes (dynamic) - auto M = int(m); - auto N = int(n); - auto K = int(k); - - - printf("%i %i %i %i %i %i\n", m, n, k, lda, ldb, ldc); - - dim3 dimBlock(16, 16); - dim3 dimGrid((M+127)/128, (N+127)/128); -// auto tC = make_layout(make_shape(Int<16>{}, Int<16>{})); -//- -//- dim3 dimBlock(size(tC)); -//- dim3 dimGrid(ceil_div(size(M), size(bM)), -//- ceil_div(size(N), size(bN))); + cout << num_blocks << endl; gemm_device - <<< dimGrid, dimBlock, 0, 0 >>> - (M, N, K, + <<< num_blocks, dimBlock, 0, 0 >>> + (m, n, k, A, B, C, lda, ldb, ldc, diff --git a/tests/test_functional.py b/tests/test_functional.py index 128c803..dd41972 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2363,12 +2363,6 @@ def test_cutlass3_gemm(): print(B) C1 = torch.matmul(A, B) - print(C1) - C2 = F.cutlass3_gemm(A, B.t()) - print(C2) C2 = F.cutlass3_gemm(A, B) - print(C2) - C2 = F.cutlass3_gemm(B.t(), A.t().contiguous()) - print(C2)