Added changes for deployment.
This commit is contained in:
parent
0f40fa3f0a
commit
ac5550a023
1
Makefile
1
Makefile
|
@ -33,7 +33,6 @@ COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
|
|||
COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
|
||||
COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
|
||||
COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
|
||||
COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta
|
||||
|
||||
CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler
|
||||
CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler
|
||||
|
|
|
@ -16,15 +16,12 @@
|
|||
#include <thrust/device_vector.h>
|
||||
#include <mma.h>
|
||||
|
||||
#include <cooperative_groups/memcpy_async.h>
|
||||
#include <cuda/pipeline>
|
||||
|
||||
#define HLF_MAX 65504
|
||||
#define TH 1024
|
||||
#define NUM 4
|
||||
#define NUM_BLOCK 4096
|
||||
|
||||
using namespace nvcuda;
|
||||
|
||||
// source: https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
|
||||
__device__ float atomicMax(float* address, float val) {
|
||||
|
@ -3094,6 +3091,9 @@ template <typename T, typename TCAST, int ITEMS> __device__ inline void vector_l
|
|||
#define WARPS 5
|
||||
template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc)
|
||||
{
|
||||
|
||||
#if __CUDA_ARCH__ >= 750
|
||||
using namespace nvcuda;
|
||||
int col_offset = blockIdx.x *32;
|
||||
const int warp_id = threadIdx.x / 32;
|
||||
const int half_warp_id = threadIdx.x / 16;
|
||||
|
@ -3294,11 +3294,14 @@ template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M,
|
|||
|
||||
if(col_offset + warp_lane < M)
|
||||
out[col_offset + warp_lane] = smem_A[warp_lane];
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize)
|
||||
{
|
||||
|
||||
#if __CUDA_ARCH__ >= 750
|
||||
using namespace nvcuda;
|
||||
int col_offset = blockIdx.x *32;
|
||||
const int warp_id = threadIdx.x / 32;
|
||||
const int half_warp_id = threadIdx.x / 16;
|
||||
|
@ -3459,6 +3462,7 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
|
|||
|
||||
if(col_offset + warp_lane < M)
|
||||
out[col_offset + warp_lane] = smem_A[warp_lane];
|
||||
#endif
|
||||
}
|
||||
|
||||
//#define ROWS 2
|
||||
|
|
11
deploy.sh
11
deploy.sh
|
@ -139,17 +139,6 @@ if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then
|
|||
fi
|
||||
|
||||
|
||||
make clean
|
||||
export CUDA_HOME=$BASE_PATH/cuda-10.2
|
||||
make cuda10x_nomatmul CUDA_VERSION=102
|
||||
|
||||
if [ ! -f "./bitsandbytes/libbitsandbytes_cuda102_nocublaslt.so" ]; then
|
||||
# Control will enter here if the compiled library file doesn't exist.
|
||||
# Report the failed build on stderr (spelling of the message corrected).
echo "Compilation unsuccessful!" 1>&2
|
||||
exit 64
|
||||
fi
|
||||
|
||||
|
||||
make clean
|
||||
export CUDA_HOME=$BASE_PATH/cuda-11.0
|
||||
make cuda110_nomatmul CUDA_VERSION=110
|
||||
|
|
Loading…
Reference in New Issue
Block a user