Added non-cutlass template.
This commit is contained in:
parent
0afc8e9e2f
commit
d1c4c20568
14
Makefile
14
Makefile
|
@ -1,8 +1,8 @@
|
|||
MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
|
||||
ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH)))
|
||||
|
||||
#GPP:= /usr/bin/g++
|
||||
GPP:= /sw/gcc/11.2.0/bin/g++
|
||||
GPP:= /usr/bin/g++
|
||||
#GPP:= /sw/gcc/11.2.0/bin/g++
|
||||
ifeq ($(CUDA_HOME),)
|
||||
CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev)
|
||||
endif
|
||||
|
@ -26,7 +26,6 @@ FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c
|
|||
|
||||
INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
|
||||
INCLUDE_10x := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/dependencies/cub -I $(ROOT_DIR)/include
|
||||
INCLUDE_cutlass := -I $(ROOT_DIR)/dependencies/cutlass/include -I $(ROOT_DIR)/dependencies/cutlass/tools/util/include/ -I $(ROOT_DIR)/dependencies/cutlass/include/cute/util/
|
||||
LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcurand -lcusparse -L $(CONDA_PREFIX)/lib
|
||||
|
||||
# NVIDIA NVCC compilation flags
|
||||
|
@ -63,8 +62,8 @@ CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90
|
|||
|
||||
|
||||
all: $(BUILD_DIR) env
|
||||
$(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
|
||||
$(NVCC) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
|
||||
$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
|
||||
$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
|
||||
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
|
||||
|
||||
cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
|
||||
|
@ -102,11 +101,6 @@ cuda11x: $(BUILD_DIR) env
|
|||
$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
|
||||
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
|
||||
|
||||
cuda11x_cutlass: $(BUILD_DIR) env cutlass
|
||||
$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math --expt-relaxed-constexpr -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(INCLUDE_cutlass) $(LIB) --output-directory $(BUILD_DIR)
|
||||
$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
|
||||
$(GPP) -std=c++17 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(INCLUDE_cutlass) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
|
||||
|
||||
cuda12x: $(BUILD_DIR) env
|
||||
$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
|
||||
$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
|
||||
|
|
|
@ -1456,7 +1456,7 @@ def cutlass3_gemm(
|
|||
# [km, nk -> mn]
|
||||
lda = ldb = ldc = 1
|
||||
#lda = 1
|
||||
print(m, n, k, lda, ldb, ldc)
|
||||
#print(m, n, k, lda, ldb, ldc)
|
||||
is_on_gpu([B, A, out])
|
||||
m = ct.c_int32(m)
|
||||
n = ct.c_int32(n)
|
||||
|
@ -1466,7 +1466,7 @@ def cutlass3_gemm(
|
|||
ldc = ct.c_int32(ldc)
|
||||
alpha = ct.c_float(1.0)
|
||||
beta = ct.c_float(0.0)
|
||||
lib.ccutlass_gemm(m, n, k, alpha, get_ptr(B), lda, get_ptr(A), ldb, beta, get_ptr(out), ldc)
|
||||
lib.ccutlass_gemm(m, n, k, alpha, get_ptr(A), ldb, get_ptr(B), lda, beta, get_ptr(out), ldc)
|
||||
|
||||
return out
|
||||
|
||||
|
|
158
csrc/kernels.cu
158
csrc/kernels.cu
|
@ -15,11 +15,6 @@
|
|||
#include <thrust/host_vector.h>
|
||||
#include <thrust/device_vector.h>
|
||||
|
||||
#include <cute/tensor.hpp>
|
||||
#include "cutlass/util/print_error.hpp"
|
||||
#include "cutlass/util/GPU_Clock.hpp"
|
||||
#include "cutlass/util/cublas_wrappers.hpp"
|
||||
|
||||
#define HLF_MAX 65504
|
||||
#define TH 1024
|
||||
#define NUM 4
|
||||
|
@ -2949,147 +2944,42 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
|
|||
//// 9. write outputs to matmul output matrix
|
||||
//}
|
||||
|
||||
#include "cutlass/util/print_error.hpp"
|
||||
#include "cutlass/util/GPU_Clock.hpp"
|
||||
#if defined(CUTLASS_ENABLE_CUBLAS) && CUTLASS_ENABLE_CUBLAS != 0
|
||||
# include "cutlass/util/cublas_wrappers.hpp"
|
||||
#endif
|
||||
//#include "cutlass/util/helper_cuda.hpp"
|
||||
|
||||
__global__ void gemm_device(int M, int N, int K,
|
||||
float const* A,
|
||||
float const* B,
|
||||
float * out, int lda, int ldb, int ldc,
|
||||
float alpha, float beta)
|
||||
{
|
||||
using namespace cute;
|
||||
using X = Underscore;
|
||||
// 0. We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp
|
||||
// 1. Load dataB into register
|
||||
// 2. Dequantize B
|
||||
// 3. Fetch data from A and multiply
|
||||
|
||||
// Preconditions
|
||||
//CUTE_STATIC_ASSERT(is_static<ABlockLayout>::value);
|
||||
//CUTE_STATIC_ASSERT(is_static<BBlockLayout>::value);
|
||||
//CUTE_STATIC_ASSERT(is_static<CBlockLayout>::value);
|
||||
typedef cub::BlockLoad<float, 256 , 1, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadA;
|
||||
__shared__ typename LoadA::TempStorage loada;
|
||||
float dataA[1];
|
||||
int valid_items = 0;
|
||||
|
||||
//CUTE_STATIC_ASSERT(is_static<AThreadLayout>::value);
|
||||
//CUTE_STATIC_ASSERT(is_static<BThreadLayout>::value);
|
||||
//CUTE_STATIC_ASSERT(is_static<CThreadLayout>::value);
|
||||
|
||||
//CUTE_STATIC_ASSERT_V(size(tA) == size(tC));
|
||||
//CUTE_STATIC_ASSERT_V(size(tB) == size(tC));
|
||||
|
||||
// Define block sizes (static)
|
||||
auto bM = Int<128>{};
|
||||
auto bN = Int<128>{};
|
||||
auto bK = Int< 8>{};
|
||||
|
||||
// Define the block layouts (static)
|
||||
auto bA = make_layout(make_shape(bM,bK));
|
||||
auto bB = make_layout(make_shape(bN,bK));
|
||||
auto bC = make_layout(make_shape(bM,bN));
|
||||
|
||||
// Define the thread layouts (static)
|
||||
auto tA = make_layout(make_shape(Int<32>{}, Int< 8>{}));
|
||||
auto tB = make_layout(make_shape(Int<32>{}, Int< 8>{}));
|
||||
auto tC = make_layout(make_shape(Int<16>{}, Int<16>{}));
|
||||
|
||||
//CUTE_STATIC_ASSERT_V(shape<0>(blockA) == shape<0>(blockC)); // BLK_M
|
||||
//CUTE_STATIC_ASSERT_V(shape<0>(blockB) == shape<1>(blockC)); // BLK_N
|
||||
//CUTE_STATIC_ASSERT_V(shape<1>(blockA) == shape<1>(blockB)); // BLK_K
|
||||
|
||||
// Shared memory buffers
|
||||
__shared__ float smemA[128*8];
|
||||
__shared__ float smemB[128*8];
|
||||
auto sA = make_tensor(make_smem_ptr(smemA), bA); // (BLK_M,BLK_K)
|
||||
auto sB = make_tensor(make_smem_ptr(smemB), bB); // (BLK_N,BLK_K)
|
||||
|
||||
auto dA = make_stride(Int<1>{}, lda);
|
||||
auto dB = make_stride(Int<1>{}, ldb);
|
||||
auto dC = make_stride(Int<1>{}, ldc);
|
||||
|
||||
// Represent the full tensors
|
||||
auto mA = make_tensor(make_gmem_ptr(A), make_shape(M,K), dA); // (M,K)
|
||||
auto mB = make_tensor(make_gmem_ptr(B), make_shape(N,K), dB); // (N,K)
|
||||
auto mC = make_tensor(make_gmem_ptr(out), make_shape(M,N), dC); // (M,N)
|
||||
|
||||
// Get the appropriate blocks for this thread block --
|
||||
// potential for thread block locality
|
||||
auto blk_shape = make_shape(size<0>(sA), size<0>(sB), size<1>(sB));// (BLK_M,BLK_N,BLK_K)
|
||||
auto blk_coord = make_coord(blockIdx.x, blockIdx.y, _); // (m,n,k)
|
||||
|
||||
auto gA = local_tile(mA, blk_shape, blk_coord, Step<_1, X,_1>{}); // (BLK_M,BLK_K,k)
|
||||
auto gB = local_tile(mB, blk_shape, blk_coord, Step< X,_1,_1>{}); // (BLK_N,BLK_K,k)
|
||||
auto gC = local_tile(mC, blk_shape, blk_coord, Step<_1,_1, X>{}); // (BLK_M,BLK_N)
|
||||
|
||||
//
|
||||
// Partition the copying of A and B tiles across the threads
|
||||
//
|
||||
|
||||
// TUTORIAL: Example of simple partitioning of A|B tiles over tA|tB
|
||||
// Default is a raked partition, but can be changed with Step<X,Y> parameter
|
||||
|
||||
auto tAgA = local_partition(gA, tA, threadIdx.x); // (THR_M,THR_K,k)
|
||||
auto tAsA = local_partition(sA, tA, threadIdx.x); // (THR_M,THR_K)
|
||||
|
||||
auto tBgB = local_partition(gB, tB, threadIdx.x); // (THR_N,THR_K,k)
|
||||
auto tBsB = local_partition(sB, tB, threadIdx.x); // (THR_N,THR_K)
|
||||
|
||||
//
|
||||
// Define C accumulators and A/B partitioning
|
||||
//
|
||||
|
||||
// TUTORIAL: Example of partitioning via projections of tC
|
||||
|
||||
// Partition sA (M,K) by the rows of tC
|
||||
auto tCsA = local_partition(sA, tC, threadIdx.x, Step<_1, X>{}); // (THR_M,BLK_K)
|
||||
// Partition sB (N,K) by the cols of tC
|
||||
auto tCsB = local_partition(sB, tC, threadIdx.x, Step< X,_1>{}); // (THR_N,BLK_K)
|
||||
// Partition gC (M,N) by the tile of tC
|
||||
auto tCgC = local_partition(gC, tC, threadIdx.x, Step<_1,_1>{}); // (THR_M,THR_N)
|
||||
|
||||
// Allocate the accumulators -- same size as the projected data
|
||||
auto tCrC = make_fragment_like(tCgC); // (THR_M,THR_N)
|
||||
|
||||
// Clear the accumulators
|
||||
clear(tCrC);
|
||||
__shared__ float[16*256] tileA;
|
||||
|
||||
|
||||
// TUTORIAL: Example of a very simple compute loop
|
||||
// Data is read from global to shared memory via the tA|tB partitioning
|
||||
// gemm(.) operates on the shared memory directly via the tC partitioning
|
||||
|
||||
auto k_max = size<2>(tAgA);
|
||||
|
||||
for (int k = 0; k < k_max; ++k)
|
||||
{
|
||||
// Copy gmem to smem
|
||||
copy(tAgA(_,_,k), tAsA);
|
||||
copy(tBgB(_,_,k), tBsB);
|
||||
|
||||
// In case copy uses cp.async, make sure that the cp.async
|
||||
// instructions are ordered with respect to other cp.async
|
||||
// instructions (fence), then wait on all the outstanding copy
|
||||
// operations (wait<0>()). __syncthreads() alone does not do
|
||||
// this.
|
||||
//
|
||||
// NOTE: cp_async_wait<0>() currently issues cp.async.wait_all.
|
||||
// This is equivalent to cp.async.commit_group followed by
|
||||
// cp.async_wait_group 0. This should make the first
|
||||
// cp_async_fence() (which also issues cp.async.commit_group)
|
||||
// redundant. The tutorial works as-is, so we'll leave the
|
||||
// redundant fence in for now and study its removal later.
|
||||
cp_async_fence();
|
||||
cp_async_wait<0>();
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Compute gemm on smem
|
||||
gemm(tCsA, tCsB, tCrC);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
for(int idxA = 0; idxA < M*K; idxA+= 256)
|
||||
{
|
||||
valid_items = M*K - idxA > 256 ? 256 : M*K - idxA;
|
||||
int baserow = 0;
|
||||
for(int row = baserow; row < baserow+16 && row < M + ; row++)
|
||||
{
|
||||
LoadA(loada).Load(&(A[(row*lda) + i]), dataA, valid_items, 0.0f);
|
||||
tileA[row*256 + threadIdx.x] = dataA[0];
|
||||
__syncthreads();
|
||||
}
|
||||
baserow += 16;
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
axpby(alpha, tCrC, beta, tCgC);
|
||||
}
|
||||
|
||||
|
||||
|
|
28
csrc/ops.cu
28
csrc/ops.cu
|
@ -665,9 +665,6 @@ template <int FORMAT> void extractOutliers(char * A, int *idx, char *out, int id
|
|||
|
||||
|
||||
|
||||
#include <cute/tensor.hpp>
|
||||
#include "cutlass/util/helper_cuda.hpp"
|
||||
|
||||
|
||||
void gemm_host(int m, int n, int k,
|
||||
float alpha,
|
||||
|
@ -676,29 +673,14 @@ void gemm_host(int m, int n, int k,
|
|||
float beta,
|
||||
float * C, int ldc)
|
||||
{
|
||||
cute::device_init(0);
|
||||
using namespace cute;
|
||||
|
||||
dim3 dimBlock(256);
|
||||
int num_blocks = (n+31)/32;
|
||||
|
||||
|
||||
// Define shapes (dynamic)
|
||||
auto M = int(m);
|
||||
auto N = int(n);
|
||||
auto K = int(k);
|
||||
|
||||
|
||||
printf("%i %i %i %i %i %i\n", m, n, k, lda, ldb, ldc);
|
||||
|
||||
dim3 dimBlock(16, 16);
|
||||
dim3 dimGrid((M+127)/128, (N+127)/128);
|
||||
// auto tC = make_layout(make_shape(Int<16>{}, Int<16>{}));
|
||||
//-
|
||||
//- dim3 dimBlock(size(tC));
|
||||
//- dim3 dimGrid(ceil_div(size(M), size(bM)),
|
||||
//- ceil_div(size(N), size(bN)));
|
||||
cout << num_blocks << endl;
|
||||
gemm_device
|
||||
<<< dimGrid, dimBlock, 0, 0 >>>
|
||||
(M, N, K,
|
||||
<<< num_blocks, dimBlock, 0, 0 >>>
|
||||
(m, n, k,
|
||||
A,
|
||||
B,
|
||||
C, lda, ldb, ldc,
|
||||
|
|
|
@ -2363,12 +2363,6 @@ def test_cutlass3_gemm():
|
|||
print(B)
|
||||
|
||||
C1 = torch.matmul(A, B)
|
||||
print(C1)
|
||||
C2 = F.cutlass3_gemm(A, B.t())
|
||||
print(C2)
|
||||
C2 = F.cutlass3_gemm(A, B)
|
||||
print(C2)
|
||||
C2 = F.cutlass3_gemm(B.t(), A.t().contiguous())
|
||||
print(C2)
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user