just sets __AMDGCN_WAVEFRONT_SIZE forcefully to 32.
This is not correct (some GPUs don't support wave32), but it works
on the supported GPUs. It can be disabled with DISABLE_WARP_32.
With this, blockwise quantize works, and with that, nf4 is supported.
$(error ERROR: ROCM_TARGET not set. Call make with ROCM string (see https://www.llvm.org/docs/AMDGPUUsage.html#processors), for example: make hip ROCM_TARGET=gfx1030)
//TODO: figure out how to make the compiler recognize code that is never executed for the given template arguments; without the ifndef, the code below would trigger a static_assert if
//this condition is true
if ((BLOCK_SIZE / NUM_PER_TH % 64) != 0)
if ((REQUESTED_BLOCK_SIZE / NUM_PER_TH % 64) != 0)
{
printf("kQuantizeBlockwise not fully supported on Rocm! BLOCK_SIZE/NUM_PER_TH needs to be divisible by 64.");
return;
}
#else
const int BLOCK_SIZE=REQUESTED_BLOCK_SIZE;
#endif
#ifndef BITS_AND_BYTES_USE_ROCM
const int n_full = gridDim.x * BLOCK_SIZE;
int valid_items = 0;
const int base_idx = (blockIdx.x * BLOCK_SIZE);
@ -854,7 +857,6 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float