forked from mrq/bitsandbytes-rocm
Reduce diff
This commit is contained in:
parent
4d1d5b569f
commit
575aa698fa
17
csrc/ops.cu
17
csrc/ops.cu
|
@ -15,7 +15,8 @@ using namespace BinSearch;
|
||||||
using std::cout;
|
using std::cout;
|
||||||
using std::endl;
|
using std::endl;
|
||||||
|
|
||||||
void histogramScatterAdd2D(float *histogram, int *index1, int *index2, float *src, int maxidx1, int n) {
|
void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n)
|
||||||
|
{
|
||||||
int threads = 512;
|
int threads = 512;
|
||||||
int blocks = n/threads;
|
int blocks = n/threads;
|
||||||
blocks = n % threads == 0 ? blocks : blocks + 1;
|
blocks = n % threads == 0 ? blocks : blocks + 1;
|
||||||
|
@ -23,8 +24,8 @@ void histogramScatterAdd2D(float *histogram, int *index1, int *index2, float *sr
|
||||||
CUDA_CHECK_RETURN(cudaPeekAtLastError());
|
CUDA_CHECK_RETURN(cudaPeekAtLastError());
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n)
|
||||||
void estimateQuantiles(T *A, float *code, float offset, int n) {
|
{
|
||||||
int blocks = n/4096;
|
int blocks = n/4096;
|
||||||
blocks = n % 4096 == 0 ? blocks : blocks + 1;
|
blocks = n % 4096 == 0 ? blocks : blocks + 1;
|
||||||
CUDA_CHECK_RETURN(cudaMemset(code, 0, 256*sizeof(float)));
|
CUDA_CHECK_RETURN(cudaMemset(code, 0, 256*sizeof(float)));
|
||||||
|
@ -32,20 +33,14 @@ void estimateQuantiles(T *A, float *code, float offset, int n) {
|
||||||
CUDA_CHECK_RETURN(cudaPeekAtLastError());
|
CUDA_CHECK_RETURN(cudaPeekAtLastError());
|
||||||
}
|
}
|
||||||
|
|
||||||
void quantize(float *code, float *A, unsigned char *out, int n) {
|
void quantize(float *code, float *A, unsigned char *out, int n)
|
||||||
|
{
|
||||||
int blocks = n/1024;
|
int blocks = n/1024;
|
||||||
blocks = n % 1024 == 0 ? blocks : blocks + 1;
|
blocks = n % 1024 == 0 ? blocks : blocks + 1;
|
||||||
kQuantize<<<blocks, 1024>>>(code, A, out, n);
|
kQuantize<<<blocks, 1024>>>(code, A, out, n);
|
||||||
CUDA_CHECK_RETURN(cudaPeekAtLastError());
|
CUDA_CHECK_RETURN(cudaPeekAtLastError());
|
||||||
}
|
}
|
||||||
|
|
||||||
void dequantize(float *code, unsigned char *A, float *out, int n) {
|
|
||||||
int blocks = n / 1024;
|
|
||||||
blocks = n % 1024 == 0 ? blocks : blocks + 1;
|
|
||||||
kDequantize<<<blocks, 1024>>>(code, A, out, n);
|
|
||||||
CUDA_CHECK_RETURN(cudaPeekAtLastError());
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n)
|
template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n)
|
||||||
{
|
{
|
||||||
int blocks = n/4096;
|
int blocks = n/4096;
|
||||||
|
|
|
@ -86,7 +86,7 @@ void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, floa
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
{
|
{
|
||||||
if #BUILD_CUDA
|
#if BUILD_CUDA
|
||||||
void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); }
|
void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); }
|
||||||
void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
|
void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
|
||||||
void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }
|
void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }
|
||||||
|
|
Loading…
Reference in New Issue
Block a user