engine/dep/include/FidelityFX/gpu/vrs/ffx_variable_shading.h

// This file is part of the FidelityFX SDK.
//
// Copyright (C) 2024 Advanced Micro Devices, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

/// @defgroup FfxGPUVrs FidelityFX VRS
/// FidelityFX Variable Shading GPU documentation
///
/// @ingroup FfxGPUEffects

#if defined(FFX_CPP)
#define FFX_CPU
#include <FidelityFX/gpu/ffx_core.h>

FFX_STATIC void ffxVariableShadingGetDispatchInfo(
    const FfxDimensions2D resolution, const FfxUInt32 tileSize, const bool useAditionalShadingRates, FfxUInt32& numThreadGroupsX, FfxUInt32& numThreadGroupsY)
{
    FfxUInt32 vrsImageWidth  = FFX_DIVIDE_ROUNDING_UP(resolution.width, tileSize);
    FfxUInt32 vrsImageHeight = FFX_DIVIDE_ROUNDING_UP(resolution.height, tileSize);

    if (useAditionalShadingRates)
    {
        // coarse tiles are potentially 4x4, so each thread computes 4x4 pixels
        // as a result an 8x8 threadgroup computes 32x32 pixels
        numThreadGroupsX = FFX_DIVIDE_ROUNDING_UP(vrsImageWidth * tileSize, 32);
        numThreadGroupsY = FFX_DIVIDE_ROUNDING_UP(vrsImageHeight * tileSize, 32);
    }
    else
    {
        // coarse tiles are potentially 2x2, so each thread computes 2x2 pixels
        if (tileSize == 8)
        {
            //each threadgroup computes 4 VRS tiles
            numThreadGroupsX = FFX_DIVIDE_ROUNDING_UP(vrsImageWidth, 2);
            numThreadGroupsY = FFX_DIVIDE_ROUNDING_UP(vrsImageHeight, 2);
        }
        else
        {
            //each threadgroup computes one VRS tile
            numThreadGroupsX = vrsImageWidth;
            numThreadGroupsY = vrsImageHeight;
        }
    }
}
#elif defined(FFX_GPU)

// Forward declaration of functions that need to be implemented by shader code using this technique
FfxFloat32   ReadLuminance(FfxInt32x2 pos);
FfxFloat32x2 ReadMotionVec2D(FfxInt32x2 pos);
void         WriteVrsImage(FfxInt32x2 pos, FfxUInt32 value);

FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE1D_1X = 0x0;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE1D_2X = 0x1;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE1D_4X = 0x2;
#define FFX_VARIABLESHADING_MAKE_SHADING_RATE(x,y) ((x << 2) | (y))

FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_1X1 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, FFX_VARIABLESHADING_RATE1D_1X); // 0;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_1X2 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, FFX_VARIABLESHADING_RATE1D_2X); // 0x1;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_2X1 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_1X); // 0x4;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_2X2 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_2X); // 0x5;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_2X4 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_4X); // 0x6;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_4X2 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_4X, FFX_VARIABLESHADING_RATE1D_2X); // 0x9;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_4X4 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_4X, FFX_VARIABLESHADING_RATE1D_4X); // 0xa;

#if !defined FFX_VARIABLESHADING_ADDITIONALSHADINGRATES
#if FFX_VARIABLESHADING_TILESIZE == 8
FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount1D = 8;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks1D = 2;
#elif FFX_VARIABLESHADING_TILESIZE == 16
FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount1D = 8;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks1D = 1;
#else // FFX_VARIABLESHADING_TILESIZE == 32
FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount1D = 16;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks1D = 1;
#endif
FFX_STATIC const FfxUInt32 FFX_VariableShading_SampleCount1D = FFX_VariableShading_ThreadCount1D + 2;

FFX_GROUPSHARED FfxUInt32 FFX_VariableShading_LdsGroupReduce;

FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount = FFX_VariableShading_ThreadCount1D * FFX_VariableShading_ThreadCount1D;
FFX_STATIC const FfxUInt32 FFX_VariableShading_SampleCount = FFX_VariableShading_SampleCount1D * FFX_VariableShading_SampleCount1D;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks = FFX_VariableShading_NumBlocks1D * FFX_VariableShading_NumBlocks1D;

FFX_GROUPSHARED FfxFloat32x3 FFX_VariableShading_LdsVariance[FFX_VariableShading_SampleCount];
FFX_GROUPSHARED FfxFloat32 FFX_VariableShading_LdsMin[FFX_VariableShading_SampleCount];
FFX_GROUPSHARED FfxFloat32 FFX_VariableShading_LdsMax[FFX_VariableShading_SampleCount];

#else //if defined FFX_VARIABLESHADING_ADDITIONALSHADINGRATES
FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount1D = 8;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks1D = 32 / FFX_VARIABLESHADING_TILESIZE;
FFX_STATIC const FfxUInt32 FFX_VariableShading_TilesPerGroup = FFX_VariableShading_NumBlocks1D * FFX_VariableShading_NumBlocks1D;
FFX_STATIC const FfxUInt32 FFX_VariableShading_SampleCount1D = FFX_VariableShading_ThreadCount1D + 2;

FFX_GROUPSHARED FfxUInt32 FFX_VariableShading_LdsGroupReduce[FFX_VariableShading_TilesPerGroup];

FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount = FFX_VariableShading_ThreadCount1D * FFX_VariableShading_ThreadCount1D;
FFX_STATIC const FfxUInt32 FFX_VariableShading_SampleCount = FFX_VariableShading_SampleCount1D * FFX_VariableShading_SampleCount1D;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks = FFX_VariableShading_NumBlocks1D * FFX_VariableShading_NumBlocks1D;

// load and compute variance for 1x2, 2x1, 2x2, 2x4, 4x2, 4x4 for 8x8 coarse pixels
FFX_GROUPSHARED FfxUInt32 FFX_VariableShading_LdsShadingRate[FFX_VariableShading_SampleCount];
#endif

// Read luminance value from previous frame's color buffer.
FfxFloat32 VrsGetLuminance(FfxInt32x2 pos)
{
    FfxFloat32x2 v = ReadMotionVec2D(pos);
    pos            = pos - FfxInt32x2(round(v));
    // clamp to screen
    if (pos.x < 0) pos.x = 0;
    if (pos.y < 0) pos.y = 0;
    if (pos.x >= Resolution().x)
        pos.x = Resolution().x - 1;
    if (pos.y >= Resolution().y)
        pos.y = Resolution().y - 1;

    return ReadLuminance(pos);
}

// Get flattened LDS offset.
FfxInt32 VrsFlattenLdsOffset(FfxInt32x2 coord)
{
    coord += 1;
    return coord.y * FfxInt32(FFX_VariableShading_SampleCount1D) + coord.x;
}

#if !defined FFX_VARIABLESHADING_ADDITIONALSHADINGRATES

/// Generate and write shading rates to VRS image.
///
/// @param [in] Gid                      Index for which thread group the compute shader is executing in.
/// @param [in] Gtid                     Thread index within a thread group the compute shader is executing in.
/// @param [in] Gidx                     Flattened index of compute shader thread.
///
/// @ingroup FfxGPUVrs
void VrsGenerateVrsImage(FfxUInt32x3 Gid, FfxUInt32x3 Gtid, FfxUInt32 Gidx)
{
    FfxInt32x2 tileOffset = FfxInt32x2(Gid.xy * FFX_VariableShading_ThreadCount1D * 2);
    FfxInt32x2 baseOffset = tileOffset + FfxInt32x2(-2, -2);
    FfxUInt32 index = Gidx;

#if FFX_VARIABLESHADING_TILESIZE > 8
    if (index == 0)
    {
        FFX_VariableShading_LdsGroupReduce = FFX_VARIABLESHADING_RATE_2X2;
    }
#endif

    // sample source texture (using motion vectors)
    while (index < FFX_VariableShading_SampleCount)
    {
        FfxInt32x2 index2D = 2 * FfxInt32x2(index % FFX_VariableShading_SampleCount1D, index / FFX_VariableShading_SampleCount1D);
        FfxFloat32x4 lum;
        lum.x = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(0, 0));
        lum.y = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(1, 0));
        lum.z = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(0, 1));
        lum.w = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(1, 1));

        // compute the 2x1, 1x2 and 2x2 variance inside the 2x2 coarse pixel region
        FfxFloat32x3 delta;
        delta.x = ffxMax(abs(lum.x - lum.y), abs(lum.z - lum.w));
        delta.y = ffxMax(abs(lum.x - lum.z), abs(lum.y - lum.w));
        FfxFloat32x2 minmax = FfxFloat32x2(ffxMin(ffxMin(ffxMin(lum.x, lum.y), lum.z), lum.w), ffxMax(ffxMax(ffxMax(lum.x, lum.y), lum.z), lum.w));
        delta.z = minmax.y - minmax.x;

        // reduce variance value for fast moving pixels
        FfxFloat32 v = length(ReadMotionVec2D(baseOffset + index2D));
        v *= MotionFactor();
        delta -= v;
        minmax.y -= v;

        // store variance as well as min/max luminance
        FFX_VariableShading_LdsVariance[index] = delta;
        FFX_VariableShading_LdsMin[index] = minmax.x;
        FFX_VariableShading_LdsMax[index] = minmax.y;

        index += FFX_VariableShading_ThreadCount;
    }
#if defined(FFX_HLSL)
    GroupMemoryBarrierWithGroupSync();
#elif defined(FFX_GLSL)
    barrier();
#endif

    // upper left coordinate in LDS
    FfxInt32x2 threadUV = FfxInt32x2(Gtid.xy);

    // look at neighbouring coarse pixels, to combat burn in effect due to frame dependence
    FfxFloat32x3 delta = FFX_VariableShading_LdsVariance[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 0))];

    // read the minimum luminance for neighbouring coarse pixels
    FfxFloat32 minNeighbour = FFX_VariableShading_LdsMin[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, -1))];
    minNeighbour = ffxMin(minNeighbour, FFX_VariableShading_LdsMin[VrsFlattenLdsOffset(threadUV + FfxInt32x2(-1, 0))]);
    minNeighbour = ffxMin(minNeighbour, FFX_VariableShading_LdsMin[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 1))]);
    minNeighbour = ffxMin(minNeighbour, FFX_VariableShading_LdsMin[VrsFlattenLdsOffset(threadUV + FfxInt32x2(1, 0))]);
    FfxFloat32 dMin = ffxMax(0.f, FFX_VariableShading_LdsMin[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 0))] - minNeighbour);

    // read the maximum luminance for neighbouring coarse pixels
    FfxFloat32 maxNeighbour = FFX_VariableShading_LdsMax[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, -1))];
    maxNeighbour = ffxMax(maxNeighbour, FFX_VariableShading_LdsMax[VrsFlattenLdsOffset(threadUV + FfxInt32x2(-1, 0))]);
    maxNeighbour = ffxMax(maxNeighbour, FFX_VariableShading_LdsMax[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 1))]);
    maxNeighbour = ffxMax(maxNeighbour, FFX_VariableShading_LdsMax[VrsFlattenLdsOffset(threadUV + FfxInt32x2(1, 0))]);
    FfxFloat32 dMax = ffxMax(0.f, maxNeighbour - FFX_VariableShading_LdsMax[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 0))]);

    // assume higher luminance based on min & max values gathered from neighbouring pixels
    delta = ffxMax(FfxFloat32x3(0.f, 0.f, 0.f), delta + dMin + dMax);

    // Reduction: find maximum variance within VRS tile
#if FFX_VARIABLESHADING_TILESIZE > 8
    // with tilesize=16 we compute 1 tile in one 8x8 threadgroup, in wave32 mode we'll need LDS to compute the per tile max
    // similar for tilesize=32: 1 tile is computed in a 16x16 threadgroup, so we definitely need LDS
#if defined(FFX_HLSL)
    delta = WaveActiveMax(delta);
#elif defined(FFX_GLSL)
    delta = subgroupMax(delta);
#endif

#if defined(FFX_HLSL)
    if (WaveIsFirstLane())
#elif defined(FFX_GLSL)
    if (0 == gl_SubgroupInvocationID)
#endif
    {
        FfxUInt32 shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, FFX_VARIABLESHADING_RATE1D_1X);

        if (delta.z < VarianceCutoff())
        {
            shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_2X);
        }
        else
        {
            if (delta.x > delta.y)
            {
                shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, (delta.y > VarianceCutoff()) ? FFX_VARIABLESHADING_RATE1D_1X : FFX_VARIABLESHADING_RATE1D_2X);
            }
            else
            {
                shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE((delta.x > VarianceCutoff()) ? FFX_VARIABLESHADING_RATE1D_1X : FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_1X);
            }
        }
#if defined(FFX_HLSL)
        InterlockedAnd(FFX_VariableShading_LdsGroupReduce, shadingRate);
#elif defined(FFX_GLSL)
        atomicAnd(FFX_VariableShading_LdsGroupReduce, shadingRate);
#endif
    }
#if defined(FFX_HLSL)
    GroupMemoryBarrierWithGroupSync();
#elif defined(FFX_GLSL)
    barrier();
#endif

    if (Gidx == 0)
    {
        // Store
        WriteVrsImage(FfxInt32x2(Gid.xy), FFX_VariableShading_LdsGroupReduce);
    }
#else
    // with tilesize=8 we compute 2x2 tiles in one 8x8 threadgroup
    // even in wave32 mode wave FfxInt32rinsics are sufficient
    FfxFloat32x4 diffX = FfxFloat32x4(0, 0, 0, 0);
    FfxFloat32x4 diffY = FfxFloat32x4(0, 0, 0, 0);
    FfxFloat32x4 diffZ = FfxFloat32x4(0, 0, 0, 0);
    FfxUInt32 idx = (Gtid.y & (FFX_VariableShading_NumBlocks1D - 1)) * FFX_VariableShading_NumBlocks1D + (Gtid.x & (FFX_VariableShading_NumBlocks1D - 1));
    diffX[idx] = delta.x;
    diffY[idx] = delta.y;
    diffZ[idx] = delta.z;
#if defined(FFX_HLSL)
    diffX = WaveActiveMax(diffX);
    diffY = WaveActiveMax(diffY);
    diffZ = WaveActiveMax(diffZ);
#elif defined(FFX_GLSL)
    diffX = subgroupMax(diffX);
    diffY = subgroupMax(diffY);
    diffZ = subgroupMax(diffZ);
#endif

    // write out shading rates to VRS image
    if (Gidx < FFX_VariableShading_NumBlocks)
    {
        FfxFloat32 varH = diffX[Gidx];
        FfxFloat32 varV = diffY[Gidx];
        FfxFloat32 var = diffZ[Gidx];;
        FfxUInt32 shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, FFX_VARIABLESHADING_RATE1D_1X);

        if (var < VarianceCutoff())
        {
            shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_2X);
        }
        else
        {
            if (varH > varV)
            {
                shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, (varV > VarianceCutoff()) ? FFX_VARIABLESHADING_RATE1D_1X : FFX_VARIABLESHADING_RATE1D_2X);
            }
            else
            {
                shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE((varH > VarianceCutoff()) ? FFX_VARIABLESHADING_RATE1D_1X : FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_1X);
            }
        }
        // Store
          WriteVrsImage(
            FfxInt32x2(Gid.xy * FFX_VariableShading_NumBlocks1D + FfxUInt32x2(Gidx / FFX_VariableShading_NumBlocks1D, Gidx % FFX_VariableShading_NumBlocks1D)), shadingRate);
    }
#endif
}

#else // if defined FFX_VARIABLESHADING_ADDITIONALSHADINGRATES

/// Generate and write shading rates to VRS image.
///
/// @param [in] Gid                      Index for which thread group the compute shader is executing in.
/// @param [in] Gtid                     Thread index within a thread group the compute shader is executing in.
/// @param [in] Gidx                     Flattened index of compute shader thread.
///
/// @ingroup FfxGPUVrs
void VrsGenerateVrsImage(FfxUInt32x3 Gid, FfxUInt32x3 Gtid, FfxUInt32 Gidx)
{
    FfxInt32x2 tileOffset = FfxInt32x2(Gid.xy * FFX_VariableShading_ThreadCount1D * 4);
    FfxInt32x2 baseOffset = tileOffset;
    FfxUInt32 index      = Gidx;

    while (index < FFX_VariableShading_SampleCount)
    {
        FfxInt32x2 index2D = 4 * FfxInt32x2(index % FFX_VariableShading_SampleCount1D, index / FFX_VariableShading_SampleCount1D);

        // reduce shading rate for fast moving pixels
        FfxFloat32 v = length(ReadMotionVec2D(baseOffset + index2D));
        v *= MotionFactor();

        // compute variance for one 4x4 region
        FfxFloat32 var2x1 = 0;
        FfxFloat32 var1x2 = 0;
        FfxFloat32 var2x2 = 0;
        FfxFloat32x2 minmax4x2[2] = { FfxFloat32x2(VarianceCutoff(), 0.f), FfxFloat32x2(VarianceCutoff(), 0.f) };
        FfxFloat32x2 minmax2x4[2] = { FfxFloat32x2(VarianceCutoff(), 0.f), FfxFloat32x2(VarianceCutoff(), 0.f) };
        FfxFloat32x2 minmax4x4 = FfxFloat32x2(VarianceCutoff(), 0.f);

        // computes variance for 2x2 tiles
        // also we need min/max for 2x4, 4x2 & 4x4
        for (FfxUInt32 y = 0; y < 2; y += 1)
        {
            FfxFloat32 tmpVar4x2 = 0;
            for (FfxUInt32 x = 0; x < 2; x += 1)
            {
                FfxInt32x2 index2D = 4 * FfxInt32x2(index % FFX_VariableShading_SampleCount1D, index / FFX_VariableShading_SampleCount1D) + FfxInt32x2(2 * x, 2 * y);
                FfxFloat32x4 lum;
                lum.x = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(0, 0));
                lum.y = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(1, 0));
                lum.z = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(0, 1));
                lum.w = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(1, 1));

                FfxFloat32x2 minmax = FfxFloat32x2(ffxMin(ffxMin(lum.x, lum.y), ffxMin(lum.z, lum.w)), ffxMax(ffxMax(lum.x, lum.y), ffxMax(lum.z, lum.w)));
                FfxFloat32x3 delta;
                delta.x = ffxMax(abs(lum.x - lum.y), abs(lum.z - lum.w));
                delta.y = ffxMax(abs(lum.x - lum.y), abs(lum.z - lum.w));
                delta.z = minmax.y - minmax.x;

                // reduce shading rate for fast moving pixels
                delta = ffxMax(FfxFloat32x3(0.f, 0.f, 0.f), delta - v);

                var2x1 = ffxMax(var2x1, delta.x);
                var1x2 = ffxMax(var1x2, delta.y);
                var2x2 = ffxMax(var2x2, delta.z);

                minmax4x2[y].x = ffxMin(minmax4x2[y].x, minmax.x);
                minmax4x2[y].y = ffxMax(minmax4x2[y].y, minmax.y);

                minmax2x4[x].x = ffxMin(minmax2x4[x].x, minmax.x);
                minmax2x4[x].y = ffxMax(minmax2x4[x].y, minmax.y);

                minmax4x4.x = ffxMin(minmax4x4.x, minmax.x);
                minmax4x4.y = ffxMax(minmax4x4.y, minmax.y);
            }
        }

        FfxFloat32 var4x2 = ffxMax(0.f, ffxMax(minmax4x2[0].y - minmax4x2[0].x, minmax4x2[1].y - minmax4x2[1].x) - v);
        FfxFloat32 var2x4 = ffxMax(0.f, ffxMax(minmax2x4[0].y - minmax2x4[0].x, minmax2x4[1].y - minmax2x4[1].x) - v);
        FfxFloat32 var4x4 = ffxMax(0.f, minmax4x4.y - minmax4x4.x - v);

        FfxUInt32 shadingRate = FFX_VARIABLESHADING_RATE_1X1;
        if (var4x4 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_4X4;
        else if (var4x2 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_4X2;
        else if (var2x4 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_2X4;
        else if (var2x2 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_2X2;
        else if (var2x1 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_2X1;
        else if (var1x2 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_1X2;

        FFX_VariableShading_LdsShadingRate[index] = shadingRate;

        index += FFX_VariableShading_ThreadCount;
    }

    if (Gidx < FFX_VariableShading_TilesPerGroup)
    {
        FFX_VariableShading_LdsGroupReduce[Gidx] = 0;
    }
#if defined(FFX_HLSL)
    GroupMemoryBarrierWithGroupSync();
#elif defined(FFX_GLSL)
    barrier();
#endif

    FfxInt32 i = 0;
    FfxInt32x2 threadUV = FfxInt32x2(Gtid.xy);

    FfxUInt32 shadingRate[FFX_VariableShading_TilesPerGroup];
    for (i = 0; i < FFX_VariableShading_TilesPerGroup; ++i)
    {
        shadingRate[i] = FFX_VARIABLESHADING_RATE_4X4;
    }
    FfxUInt32 idx    = (Gtid.y & (FFX_VariableShading_NumBlocks1D - 1)) * FFX_VariableShading_NumBlocks1D + (Gtid.x & (FFX_VariableShading_NumBlocks1D - 1));
    shadingRate[idx] = FFX_VariableShading_LdsShadingRate[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 0))];
    shadingRate[idx] = ffxMin(shadingRate[idx], FFX_VariableShading_LdsShadingRate[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, -1))]);
    shadingRate[idx] = ffxMin(shadingRate[idx], FFX_VariableShading_LdsShadingRate[VrsFlattenLdsOffset(threadUV + FfxInt32x2(-1, 0))]);
    shadingRate[idx] = ffxMin(shadingRate[idx], FFX_VariableShading_LdsShadingRate[VrsFlattenLdsOffset(threadUV + FfxInt32x2(1, 0))]);
    shadingRate[idx] = ffxMin(shadingRate[idx], FFX_VariableShading_LdsShadingRate[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 1))]);

    // wave-reduce
    for (i = 0; i < FFX_VariableShading_TilesPerGroup; ++i)
    {
#if defined(FFX_HLSL)
        shadingRate[i] = WaveActiveMin(shadingRate[i]);
#elif defined(FFX_GLSL)
        shadingRate[i] = subgroupMin(shadingRate[i]);
#endif
    }

    // threadgroup-reduce
#if FFX_VARIABLESHADING_TILESIZE<16
#if defined(FFX_HLSL)
    if (WaveIsFirstLane())
#elif defined(FFX_GLSL)
    if (0 == gl_SubgroupInvocationID)
#endif
    {
        for (i = 0; i < FFX_VariableShading_TilesPerGroup; ++i)
        {
#if defined(FFX_HLSL)
            InterlockedAnd(FFX_VariableShading_LdsGroupReduce[i], shadingRate[i]);
#elif defined(FFX_GLSL)
            atomicAnd(FFX_VariableShading_LdsGroupReduce[i], shadingRate[i]);
#endif
        }
    }
#if defined(FFX_HLSL)
    GroupMemoryBarrierWithGroupSync();
#elif defined(FFX_GLSL)
    barrier();
#endif

    // write out final rates
    if (Gidx < FFX_VariableShading_TilesPerGroup)
    {
        WriteVrsImage(
            FfxInt32x2(Gid.xy * FFX_VariableShading_NumBlocks1D + FfxUInt32x2(Gidx / FFX_VariableShading_NumBlocks1D, Gidx % FFX_VariableShading_NumBlocks1D)),
            FFX_VariableShading_LdsGroupReduce[Gidx]);
    }
#else
    // write out final rates
    if (Gidx < FFX_VariableShading_TilesPerGroup)
    {
        WriteVrsImage(
            FfxInt32x2(Gid.xy * FFX_VariableShading_NumBlocks1D + FfxUInt32x2(Gidx / FFX_VariableShading_NumBlocks1D, Gidx % FFX_VariableShading_NumBlocks1D)),
            shadingRate[Gidx]);
    }
#endif


}
#endif // FFX_VARIABLESHADING_ADDITIONALSHADINGRATES
#endif // FFX_CPP|FFX_GPU