// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#ifndef ops_H
#define ops_H
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>

#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
#include <hipblas.h>
// #include <cublasLt.h>
#include <hipsparse.h>
// Opaque handle type: a pointer to the incomplete struct `cublasLtContext`.
// It appears in the igemmlt() and transform() declarations of this header.
// NOTE(review): presumably declared locally because the <cublasLt.h> include
// is commented out in this port - confirm.
typedef struct cublasLtContext* cublasLtHandle_t;
// Evaluates `value` (an expression yielding hipError_t) exactly once. If the
// result is not hipSuccess, prints the HIP error string together with the
// expansion site's line and file to stderr and terminates with exit(1).
//
// The expansion stays a plain brace-delimited block (not do/while(0)) so that
// every existing call site keeps compiling unchanged. As a consequence, do not
// use it as the unbraced body of an `if` that is followed by `else`.
#define CUDA_CHECK_RETURN(value) {                                          \
    hipError_t _m_cudaStat = (value);                                       \
    if (_m_cudaStat != hipSuccess) {                                        \
        fprintf(stderr, "Error %s at line %d in file %s\n",                 \
                hipGetErrorString(_m_cudaStat), __LINE__, __FILE__);        \
        exit(1);                                                            \
    } }
// Thread-count constant (512). Its consumers are not visible in this header.
// Guarded because the same macro is defined a second time further down; the
// guard keeps the definition idempotent.
#ifndef THREADS_PER_BLOCKS
#define THREADS_PER_BLOCKS (512)
#endif
// Evaluates `value` (an expression yielding hipsparseStatus_t) exactly once.
// If the result is not HIPSPARSE_STATUS_SUCCESS, prints a diagnostic with the
// numeric status code and the expansion site's line and file to stderr, then
// terminates with exit(1).
//
// Kept as a brace-delimited block so existing call sites compile unchanged;
// do not use it as the unbraced body of an `if` that is followed by `else`.
#define CHECK_CUSPARSE(value) {                                                     \
    hipsparseStatus_t _m_cudaStat = (value);                                        \
    if (_m_cudaStat != HIPSPARSE_STATUS_SUCCESS) {                                  \
        fprintf(stderr, "Error <sparse error> (status %d) at line %d in file %s\n", \
                static_cast<int>(_m_cudaStat), __LINE__, __FILE__);                 \
        exit(1);                                                                    \
    } }
// Second definition of THREADS_PER_BLOCKS in this header (same value, 512).
// Guarded so it is a no-op when the macro is already defined.
#ifndef THREADS_PER_BLOCKS
#define THREADS_PER_BLOCKS (512)
#endif
// Validates a HIP runtime status code.
// On success (hipSuccess) returns normally. Otherwise prints the numeric status
// and its HIP error string to stdout and throws std::logic_error.
inline void checkCudaStatus(hipError_t status) {
    if (status == hipSuccess) {
        return;
    }
    // Cast explicitly: the enum is passed through printf's varargs as "%d".
    printf("cuda API failed with status %d: %s\n", static_cast<int>(status), hipGetErrorString(status));
    throw std::logic_error("cuda API failed");
}
// Validates a hipBLAS status code without throwing.
// Returns 0 when `status` is HIPBLAS_STATUS_SUCCESS; otherwise prints the
// numeric status to stdout and returns 1, so callers must inspect the result.
inline int checkCublasStatus(hipblasStatus_t status) {
    if (status == HIPBLAS_STATUS_SUCCESS) {
        return 0;
    }
    // Cast explicitly: the enum is passed through printf's varargs as "%d".
    printf("cuBLAS API failed with status %d\n", static_cast<int>(status));
    // The exception is intentionally left disabled, as in the original code:
    //throw std::logic_error("cuBLAS API failed");
    return 1;
}
// Operation selector. `ksmul` (value 0) is the only enumerator defined.
// NOTE(review): the meaning of `ksmul` is not shown in this header - confirm
// against the kernels that consume it.
enum Operations_t
{
    ksmul = 0
};
// Optimizer identifiers with fixed numeric values 0..4.
// NOTE(review): presumably supplied as the `OPTIMIZER` template argument of
// the optimizer* function templates declared in this header - confirm.
typedef enum Optimizer_t
{
    ADAM = 0,
    MOMENTUM = 1,
    RMSPROP = 2,
    LARS = 3,
    ADAGRAD = 4,
} Optimizer_t;
// Matrix layout identifiers with fixed numeric values 0..4.
// NOTE(review): presumably used as the SRC/TARGET/FORMAT template arguments of
// the transform functions declared in this header - confirm.
enum Transform_t
{
    ROW        = 0,
    COL        = 1,
    COL32      = 2,
    COL_TURING = 3,
    COL_AMPERE = 4
};
class Context
{
public:
2022-11-24 01:52:19 +00:00
hipblasHandle_t m_handle;
2022-07-22 21:41:05 +00:00
Context()
{
2022-11-24 01:52:19 +00:00
hipblasHandle_t handle;
hipblasCreate(&handle);
2022-07-22 21:41:05 +00:00
m_handle = handle;
}
};
2022-11-24 01:52:19 +00:00
// class ContextLt
// {
//     public:
//         cublasLtHandle_t m_handle;
//
//         ContextLt()
//         {
//             cublasLtHandle_t handle;
//             cublasLtCreate(&handle);
//             m_handle = handle;
//         }
//
// };
class ContextCusparse
{
public:
2022-11-24 01:52:19 +00:00
hipsparseHandle_t m_handle;
2022-07-22 21:41:05 +00:00
ContextCusparse()
{
2022-11-24 01:52:19 +00:00
hipsparseHandle_t handle;
hipsparseCreate(&handle);
2022-07-22 21:41:05 +00:00
m_handle = handle;
}
};
2021-10-06 02:16:20 +00:00
// ---- Quantization entry points (declarations only) ----
// The definitions are not part of this header. Parameter semantics are not
// documented here.
// NOTE(review): presumably `code` is a quantization lookup table, `absmax`
// holds per-block scaling values and `n` is the element count - confirm
// against the implementation.

template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n);

void quantize(float *code, float *A, unsigned char *out, int n);
void dequantize(float *code, unsigned char *A, float *out, int n);

// STOCHASTIC is a compile-time integer switch.
// NOTE(review): `rand`/`rand_offset` are presumably only read when it is
// non-zero - confirm.
template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n);
// ---- Optimizer entry points (declarations only) ----
// The definitions are not part of this header. OPTIMIZER is a compile-time
// integer.
// NOTE(review): presumably one of the Optimizer_t values - confirm.
//
// Caution: optimizer32bit takes its first two pointers in the order (g, p),
// whereas both 8-bit variants take them in the order (p, g).

template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
                float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
                float beta1, float beta2, float eps, float weight_decay,
                int step, float lr, const float gnorm_scale, bool skip_zeros, int n);

template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g, unsigned char* state1, unsigned char* state2,
                float *unorm, float max_unorm, float param_norm,
                float beta1, float beta2,
                float eps, int step, float lr,
                float* quantiles1, float* quantiles2,
                float* max1, float* max2, float* new_max1, float* new_max2,
                float weight_decay,
                const float gnorm_scale, int n);

template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
                float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale,
                bool skip_zeros, int n);

template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n);

void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n);
void gemmex(Context * context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc);
2022-10-27 11:11:29 +00:00
void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc,
2022-07-22 21:41:05 +00:00
long long int strideA, long long int strideB, long long int strideC, int batchCount);
template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc);
template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void transform(cublasLtHandle_t ltHandle, T *A, T *out, int dim1, int dim2);
void cutlass_igemm(bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc);
2022-08-16 17:56:17 +00:00
void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half* bias, int numRows, int numCols);
2022-07-22 21:41:05 +00:00
void getColRowStats(half * A, float *rowStats, float *colStats, int *nnz_count_row, float nnz_threshold, int rows, int cols);
void doubleRowColQuant(half * A, float *rowStats, float *colStats, char *out_col_normed, char *out_row_normed,
int *rowidx, int *colidx, half *val, int *nnz_block_ptr, float threshold, int rows, int cols);
template <int FORMAT, int TRANSPOSE> void transformRowToFormat(char * A, char *out, int rows, int cols);
2022-11-24 01:52:19 +00:00
void spmm_coo(hipsparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_vals, int A_nnz, int A_rows, int A_cols, int B_cols, int ldb, half *B, int ldc, half* C, bool transposed_B);
2022-07-22 21:41:05 +00:00
template <typename T, int BITS> void spmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
2022-07-26 19:12:38 +00:00
template <int FORMAT> void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols);
2021-10-06 02:16:20 +00:00
#endif