engine/dep/include/FidelityFX/gpu/vrs/ffx_variable_shading.h

490 lines
22 KiB
C

// This file is part of the FidelityFX SDK.
//
// Copyright (C) 2024 Advanced Micro Devices, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
/// @defgroup FfxGPUVrs FidelityFX VRS
/// FidelityFX Variable Shading GPU documentation
///
/// @ingroup FfxGPUEffects
#if defined(FFX_CPP)
#define FFX_CPU
#include <FidelityFX/gpu/ffx_core.h>
FFX_STATIC void ffxVariableShadingGetDispatchInfo(
const FfxDimensions2D resolution, const FfxUInt32 tileSize, const bool useAditionalShadingRates, FfxUInt32& numThreadGroupsX, FfxUInt32& numThreadGroupsY)
{
FfxUInt32 vrsImageWidth = FFX_DIVIDE_ROUNDING_UP(resolution.width, tileSize);
FfxUInt32 vrsImageHeight = FFX_DIVIDE_ROUNDING_UP(resolution.height, tileSize);
if (useAditionalShadingRates)
{
// coarse tiles are potentially 4x4, so each thread computes 4x4 pixels
// as a result an 8x8 threadgroup computes 32x32 pixels
numThreadGroupsX = FFX_DIVIDE_ROUNDING_UP(vrsImageWidth * tileSize, 32);
numThreadGroupsY = FFX_DIVIDE_ROUNDING_UP(vrsImageHeight * tileSize, 32);
}
else
{
// coarse tiles are potentially 2x2, so each thread computes 2x2 pixels
if (tileSize == 8)
{
//each threadgroup computes 4 VRS tiles
numThreadGroupsX = FFX_DIVIDE_ROUNDING_UP(vrsImageWidth, 2);
numThreadGroupsY = FFX_DIVIDE_ROUNDING_UP(vrsImageHeight, 2);
}
else
{
//each threadgroup computes one VRS tile
numThreadGroupsX = vrsImageWidth;
numThreadGroupsY = vrsImageHeight;
}
}
}
#elif defined(FFX_GPU)
// Forward declaration of functions that need to be implemented by shader code using this technique
FfxFloat32 ReadLuminance(FfxInt32x2 pos);
FfxFloat32x2 ReadMotionVec2D(FfxInt32x2 pos);
void WriteVrsImage(FfxInt32x2 pos, FfxUInt32 value);
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE1D_1X = 0x0;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE1D_2X = 0x1;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE1D_4X = 0x2;
#define FFX_VARIABLESHADING_MAKE_SHADING_RATE(x,y) ((x << 2) | (y))
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_1X1 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, FFX_VARIABLESHADING_RATE1D_1X); // 0;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_1X2 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, FFX_VARIABLESHADING_RATE1D_2X); // 0x1;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_2X1 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_1X); // 0x4;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_2X2 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_2X); // 0x5;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_2X4 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_4X); // 0x6;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_4X2 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_4X, FFX_VARIABLESHADING_RATE1D_2X); // 0x9;
FFX_STATIC const FfxUInt32 FFX_VARIABLESHADING_RATE_4X4 = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_4X, FFX_VARIABLESHADING_RATE1D_4X); // 0xa;
#if !defined FFX_VARIABLESHADING_ADDITIONALSHADINGRATES
#if FFX_VARIABLESHADING_TILESIZE == 8
FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount1D = 8;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks1D = 2;
#elif FFX_VARIABLESHADING_TILESIZE == 16
FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount1D = 8;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks1D = 1;
#else // FFX_VARIABLESHADING_TILESIZE == 32
FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount1D = 16;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks1D = 1;
#endif
FFX_STATIC const FfxUInt32 FFX_VariableShading_SampleCount1D = FFX_VariableShading_ThreadCount1D + 2;
FFX_GROUPSHARED FfxUInt32 FFX_VariableShading_LdsGroupReduce;
FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount = FFX_VariableShading_ThreadCount1D * FFX_VariableShading_ThreadCount1D;
FFX_STATIC const FfxUInt32 FFX_VariableShading_SampleCount = FFX_VariableShading_SampleCount1D * FFX_VariableShading_SampleCount1D;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks = FFX_VariableShading_NumBlocks1D * FFX_VariableShading_NumBlocks1D;
FFX_GROUPSHARED FfxFloat32x3 FFX_VariableShading_LdsVariance[FFX_VariableShading_SampleCount];
FFX_GROUPSHARED FfxFloat32 FFX_VariableShading_LdsMin[FFX_VariableShading_SampleCount];
FFX_GROUPSHARED FfxFloat32 FFX_VariableShading_LdsMax[FFX_VariableShading_SampleCount];
#else //if defined FFX_VARIABLESHADING_ADDITIONALSHADINGRATES
FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount1D = 8;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks1D = 32 / FFX_VARIABLESHADING_TILESIZE;
FFX_STATIC const FfxUInt32 FFX_VariableShading_TilesPerGroup = FFX_VariableShading_NumBlocks1D * FFX_VariableShading_NumBlocks1D;
FFX_STATIC const FfxUInt32 FFX_VariableShading_SampleCount1D = FFX_VariableShading_ThreadCount1D + 2;
FFX_GROUPSHARED FfxUInt32 FFX_VariableShading_LdsGroupReduce[FFX_VariableShading_TilesPerGroup];
FFX_STATIC const FfxUInt32 FFX_VariableShading_ThreadCount = FFX_VariableShading_ThreadCount1D * FFX_VariableShading_ThreadCount1D;
FFX_STATIC const FfxUInt32 FFX_VariableShading_SampleCount = FFX_VariableShading_SampleCount1D * FFX_VariableShading_SampleCount1D;
FFX_STATIC const FfxUInt32 FFX_VariableShading_NumBlocks = FFX_VariableShading_NumBlocks1D * FFX_VariableShading_NumBlocks1D;
// load and compute variance for 1x2, 2x1, 2x2, 2x4, 4x2, 4x4 for 8x8 coarse pixels
FFX_GROUPSHARED FfxUInt32 FFX_VariableShading_LdsShadingRate[FFX_VariableShading_SampleCount];
#endif
// Read luminance value from previous frame's color buffer.
FfxFloat32 VrsGetLuminance(FfxInt32x2 pos)
{
FfxFloat32x2 v = ReadMotionVec2D(pos);
pos = pos - FfxInt32x2(round(v));
// clamp to screen
if (pos.x < 0) pos.x = 0;
if (pos.y < 0) pos.y = 0;
if (pos.x >= Resolution().x)
pos.x = Resolution().x - 1;
if (pos.y >= Resolution().y)
pos.y = Resolution().y - 1;
return ReadLuminance(pos);
}
// Get flattened LDS offset.
FfxInt32 VrsFlattenLdsOffset(FfxInt32x2 coord)
{
coord += 1;
return coord.y * FfxInt32(FFX_VariableShading_SampleCount1D) + coord.x;
}
#if !defined FFX_VARIABLESHADING_ADDITIONALSHADINGRATES
/// Generate and write shading rates to VRS image.
///
/// @param [in] Gid Index for which thread group the compute shader is executing in.
/// @param [in] Gtid Thread index within a thread group the compute shader is executing in.
/// @param [in] Gidx Flattened index of compute shader thread.
///
/// @ingroup FfxGPUVrs
void VrsGenerateVrsImage(FfxUInt32x3 Gid, FfxUInt32x3 Gtid, FfxUInt32 Gidx)
{
FfxInt32x2 tileOffset = FfxInt32x2(Gid.xy * FFX_VariableShading_ThreadCount1D * 2);
FfxInt32x2 baseOffset = tileOffset + FfxInt32x2(-2, -2);
FfxUInt32 index = Gidx;
#if FFX_VARIABLESHADING_TILESIZE > 8
if (index == 0)
{
FFX_VariableShading_LdsGroupReduce = FFX_VARIABLESHADING_RATE_2X2;
}
#endif
// sample source texture (using motion vectors)
while (index < FFX_VariableShading_SampleCount)
{
FfxInt32x2 index2D = 2 * FfxInt32x2(index % FFX_VariableShading_SampleCount1D, index / FFX_VariableShading_SampleCount1D);
FfxFloat32x4 lum;
lum.x = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(0, 0));
lum.y = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(1, 0));
lum.z = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(0, 1));
lum.w = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(1, 1));
// compute the 2x1, 1x2 and 2x2 variance inside the 2x2 coarse pixel region
FfxFloat32x3 delta;
delta.x = ffxMax(abs(lum.x - lum.y), abs(lum.z - lum.w));
delta.y = ffxMax(abs(lum.x - lum.z), abs(lum.y - lum.w));
FfxFloat32x2 minmax = FfxFloat32x2(ffxMin(ffxMin(ffxMin(lum.x, lum.y), lum.z), lum.w), ffxMax(ffxMax(ffxMax(lum.x, lum.y), lum.z), lum.w));
delta.z = minmax.y - minmax.x;
// reduce variance value for fast moving pixels
FfxFloat32 v = length(ReadMotionVec2D(baseOffset + index2D));
v *= MotionFactor();
delta -= v;
minmax.y -= v;
// store variance as well as min/max luminance
FFX_VariableShading_LdsVariance[index] = delta;
FFX_VariableShading_LdsMin[index] = minmax.x;
FFX_VariableShading_LdsMax[index] = minmax.y;
index += FFX_VariableShading_ThreadCount;
}
#if defined(FFX_HLSL)
GroupMemoryBarrierWithGroupSync();
#elif defined(FFX_GLSL)
barrier();
#endif
// upper left coordinate in LDS
FfxInt32x2 threadUV = FfxInt32x2(Gtid.xy);
// look at neighbouring coarse pixels, to combat burn in effect due to frame dependence
FfxFloat32x3 delta = FFX_VariableShading_LdsVariance[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 0))];
// read the minimum luminance for neighbouring coarse pixels
FfxFloat32 minNeighbour = FFX_VariableShading_LdsMin[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, -1))];
minNeighbour = ffxMin(minNeighbour, FFX_VariableShading_LdsMin[VrsFlattenLdsOffset(threadUV + FfxInt32x2(-1, 0))]);
minNeighbour = ffxMin(minNeighbour, FFX_VariableShading_LdsMin[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 1))]);
minNeighbour = ffxMin(minNeighbour, FFX_VariableShading_LdsMin[VrsFlattenLdsOffset(threadUV + FfxInt32x2(1, 0))]);
FfxFloat32 dMin = ffxMax(0.f, FFX_VariableShading_LdsMin[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 0))] - minNeighbour);
// read the maximum luminance for neighbouring coarse pixels
FfxFloat32 maxNeighbour = FFX_VariableShading_LdsMax[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, -1))];
maxNeighbour = ffxMax(maxNeighbour, FFX_VariableShading_LdsMax[VrsFlattenLdsOffset(threadUV + FfxInt32x2(-1, 0))]);
maxNeighbour = ffxMax(maxNeighbour, FFX_VariableShading_LdsMax[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 1))]);
maxNeighbour = ffxMax(maxNeighbour, FFX_VariableShading_LdsMax[VrsFlattenLdsOffset(threadUV + FfxInt32x2(1, 0))]);
FfxFloat32 dMax = ffxMax(0.f, maxNeighbour - FFX_VariableShading_LdsMax[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 0))]);
// assume higher luminance based on min & max values gathered from neighbouring pixels
delta = ffxMax(FfxFloat32x3(0.f, 0.f, 0.f), delta + dMin + dMax);
// Reduction: find maximum variance within VRS tile
#if FFX_VARIABLESHADING_TILESIZE > 8
// with tilesize=16 we compute 1 tile in one 8x8 threadgroup, in wave32 mode we'll need LDS to compute the per tile max
// similar for tilesize=32: 1 tile is computed in a 16x16 threadgroup, so we definitely need LDS
#if defined(FFX_HLSL)
delta = WaveActiveMax(delta);
#elif defined(FFX_GLSL)
delta = subgroupMax(delta);
#endif
#if defined(FFX_HLSL)
if (WaveIsFirstLane())
#elif defined(FFX_GLSL)
if (0 == gl_SubgroupInvocationID)
#endif
{
FfxUInt32 shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, FFX_VARIABLESHADING_RATE1D_1X);
if (delta.z < VarianceCutoff())
{
shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_2X);
}
else
{
if (delta.x > delta.y)
{
shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, (delta.y > VarianceCutoff()) ? FFX_VARIABLESHADING_RATE1D_1X : FFX_VARIABLESHADING_RATE1D_2X);
}
else
{
shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE((delta.x > VarianceCutoff()) ? FFX_VARIABLESHADING_RATE1D_1X : FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_1X);
}
}
#if defined(FFX_HLSL)
InterlockedAnd(FFX_VariableShading_LdsGroupReduce, shadingRate);
#elif defined(FFX_GLSL)
atomicAnd(FFX_VariableShading_LdsGroupReduce, shadingRate);
#endif
}
#if defined(FFX_HLSL)
GroupMemoryBarrierWithGroupSync();
#elif defined(FFX_GLSL)
barrier();
#endif
if (Gidx == 0)
{
// Store
WriteVrsImage(FfxInt32x2(Gid.xy), FFX_VariableShading_LdsGroupReduce);
}
#else
// with tilesize=8 we compute 2x2 tiles in one 8x8 threadgroup
// even in wave32 mode wave FfxInt32rinsics are sufficient
FfxFloat32x4 diffX = FfxFloat32x4(0, 0, 0, 0);
FfxFloat32x4 diffY = FfxFloat32x4(0, 0, 0, 0);
FfxFloat32x4 diffZ = FfxFloat32x4(0, 0, 0, 0);
FfxUInt32 idx = (Gtid.y & (FFX_VariableShading_NumBlocks1D - 1)) * FFX_VariableShading_NumBlocks1D + (Gtid.x & (FFX_VariableShading_NumBlocks1D - 1));
diffX[idx] = delta.x;
diffY[idx] = delta.y;
diffZ[idx] = delta.z;
#if defined(FFX_HLSL)
diffX = WaveActiveMax(diffX);
diffY = WaveActiveMax(diffY);
diffZ = WaveActiveMax(diffZ);
#elif defined(FFX_GLSL)
diffX = subgroupMax(diffX);
diffY = subgroupMax(diffY);
diffZ = subgroupMax(diffZ);
#endif
// write out shading rates to VRS image
if (Gidx < FFX_VariableShading_NumBlocks)
{
FfxFloat32 varH = diffX[Gidx];
FfxFloat32 varV = diffY[Gidx];
FfxFloat32 var = diffZ[Gidx];;
FfxUInt32 shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, FFX_VARIABLESHADING_RATE1D_1X);
if (var < VarianceCutoff())
{
shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_2X);
}
else
{
if (varH > varV)
{
shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE(FFX_VARIABLESHADING_RATE1D_1X, (varV > VarianceCutoff()) ? FFX_VARIABLESHADING_RATE1D_1X : FFX_VARIABLESHADING_RATE1D_2X);
}
else
{
shadingRate = FFX_VARIABLESHADING_MAKE_SHADING_RATE((varH > VarianceCutoff()) ? FFX_VARIABLESHADING_RATE1D_1X : FFX_VARIABLESHADING_RATE1D_2X, FFX_VARIABLESHADING_RATE1D_1X);
}
}
// Store
WriteVrsImage(
FfxInt32x2(Gid.xy * FFX_VariableShading_NumBlocks1D + FfxUInt32x2(Gidx / FFX_VariableShading_NumBlocks1D, Gidx % FFX_VariableShading_NumBlocks1D)), shadingRate);
}
#endif
}
#else // if defined FFX_VARIABLESHADING_ADDITIONALSHADINGRATES
/// Generate and write shading rates to VRS image.
///
/// @param [in] Gid Index for which thread group the compute shader is executing in.
/// @param [in] Gtid Thread index within a thread group the compute shader is executing in.
/// @param [in] Gidx Flattened index of compute shader thread.
///
/// @ingroup FfxGPUVrs
void VrsGenerateVrsImage(FfxUInt32x3 Gid, FfxUInt32x3 Gtid, FfxUInt32 Gidx)
{
FfxInt32x2 tileOffset = FfxInt32x2(Gid.xy * FFX_VariableShading_ThreadCount1D * 4);
FfxInt32x2 baseOffset = tileOffset;
FfxUInt32 index = Gidx;
while (index < FFX_VariableShading_SampleCount)
{
FfxInt32x2 index2D = 4 * FfxInt32x2(index % FFX_VariableShading_SampleCount1D, index / FFX_VariableShading_SampleCount1D);
// reduce shading rate for fast moving pixels
FfxFloat32 v = length(ReadMotionVec2D(baseOffset + index2D));
v *= MotionFactor();
// compute variance for one 4x4 region
FfxFloat32 var2x1 = 0;
FfxFloat32 var1x2 = 0;
FfxFloat32 var2x2 = 0;
FfxFloat32x2 minmax4x2[2] = { FfxFloat32x2(VarianceCutoff(), 0.f), FfxFloat32x2(VarianceCutoff(), 0.f) };
FfxFloat32x2 minmax2x4[2] = { FfxFloat32x2(VarianceCutoff(), 0.f), FfxFloat32x2(VarianceCutoff(), 0.f) };
FfxFloat32x2 minmax4x4 = FfxFloat32x2(VarianceCutoff(), 0.f);
// computes variance for 2x2 tiles
// also we need min/max for 2x4, 4x2 & 4x4
for (FfxUInt32 y = 0; y < 2; y += 1)
{
FfxFloat32 tmpVar4x2 = 0;
for (FfxUInt32 x = 0; x < 2; x += 1)
{
FfxInt32x2 index2D = 4 * FfxInt32x2(index % FFX_VariableShading_SampleCount1D, index / FFX_VariableShading_SampleCount1D) + FfxInt32x2(2 * x, 2 * y);
FfxFloat32x4 lum;
lum.x = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(0, 0));
lum.y = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(1, 0));
lum.z = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(0, 1));
lum.w = VrsGetLuminance(baseOffset + index2D + FfxInt32x2(1, 1));
FfxFloat32x2 minmax = FfxFloat32x2(ffxMin(ffxMin(lum.x, lum.y), ffxMin(lum.z, lum.w)), ffxMax(ffxMax(lum.x, lum.y), ffxMax(lum.z, lum.w)));
FfxFloat32x3 delta;
delta.x = ffxMax(abs(lum.x - lum.y), abs(lum.z - lum.w));
delta.y = ffxMax(abs(lum.x - lum.y), abs(lum.z - lum.w));
delta.z = minmax.y - minmax.x;
// reduce shading rate for fast moving pixels
delta = ffxMax(FfxFloat32x3(0.f, 0.f, 0.f), delta - v);
var2x1 = ffxMax(var2x1, delta.x);
var1x2 = ffxMax(var1x2, delta.y);
var2x2 = ffxMax(var2x2, delta.z);
minmax4x2[y].x = ffxMin(minmax4x2[y].x, minmax.x);
minmax4x2[y].y = ffxMax(minmax4x2[y].y, minmax.y);
minmax2x4[x].x = ffxMin(minmax2x4[x].x, minmax.x);
minmax2x4[x].y = ffxMax(minmax2x4[x].y, minmax.y);
minmax4x4.x = ffxMin(minmax4x4.x, minmax.x);
minmax4x4.y = ffxMax(minmax4x4.y, minmax.y);
}
}
FfxFloat32 var4x2 = ffxMax(0.f, ffxMax(minmax4x2[0].y - minmax4x2[0].x, minmax4x2[1].y - minmax4x2[1].x) - v);
FfxFloat32 var2x4 = ffxMax(0.f, ffxMax(minmax2x4[0].y - minmax2x4[0].x, minmax2x4[1].y - minmax2x4[1].x) - v);
FfxFloat32 var4x4 = ffxMax(0.f, minmax4x4.y - minmax4x4.x - v);
FfxUInt32 shadingRate = FFX_VARIABLESHADING_RATE_1X1;
if (var4x4 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_4X4;
else if (var4x2 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_4X2;
else if (var2x4 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_2X4;
else if (var2x2 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_2X2;
else if (var2x1 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_2X1;
else if (var1x2 < VarianceCutoff()) shadingRate = FFX_VARIABLESHADING_RATE_1X2;
FFX_VariableShading_LdsShadingRate[index] = shadingRate;
index += FFX_VariableShading_ThreadCount;
}
if (Gidx < FFX_VariableShading_TilesPerGroup)
{
FFX_VariableShading_LdsGroupReduce[Gidx] = 0;
}
#if defined(FFX_HLSL)
GroupMemoryBarrierWithGroupSync();
#elif defined(FFX_GLSL)
barrier();
#endif
FfxInt32 i = 0;
FfxInt32x2 threadUV = FfxInt32x2(Gtid.xy);
FfxUInt32 shadingRate[FFX_VariableShading_TilesPerGroup];
for (i = 0; i < FFX_VariableShading_TilesPerGroup; ++i)
{
shadingRate[i] = FFX_VARIABLESHADING_RATE_4X4;
}
FfxUInt32 idx = (Gtid.y & (FFX_VariableShading_NumBlocks1D - 1)) * FFX_VariableShading_NumBlocks1D + (Gtid.x & (FFX_VariableShading_NumBlocks1D - 1));
shadingRate[idx] = FFX_VariableShading_LdsShadingRate[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 0))];
shadingRate[idx] = ffxMin(shadingRate[idx], FFX_VariableShading_LdsShadingRate[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, -1))]);
shadingRate[idx] = ffxMin(shadingRate[idx], FFX_VariableShading_LdsShadingRate[VrsFlattenLdsOffset(threadUV + FfxInt32x2(-1, 0))]);
shadingRate[idx] = ffxMin(shadingRate[idx], FFX_VariableShading_LdsShadingRate[VrsFlattenLdsOffset(threadUV + FfxInt32x2(1, 0))]);
shadingRate[idx] = ffxMin(shadingRate[idx], FFX_VariableShading_LdsShadingRate[VrsFlattenLdsOffset(threadUV + FfxInt32x2(0, 1))]);
// wave-reduce
for (i = 0; i < FFX_VariableShading_TilesPerGroup; ++i)
{
#if defined(FFX_HLSL)
shadingRate[i] = WaveActiveMin(shadingRate[i]);
#elif defined(FFX_GLSL)
shadingRate[i] = subgroupMin(shadingRate[i]);
#endif
}
// threadgroup-reduce
#if FFX_VARIABLESHADING_TILESIZE<16
#if defined(FFX_HLSL)
if (WaveIsFirstLane())
#elif defined(FFX_GLSL)
if (0 == gl_SubgroupInvocationID)
#endif
{
for (i = 0; i < FFX_VariableShading_TilesPerGroup; ++i)
{
#if defined(FFX_HLSL)
InterlockedAnd(FFX_VariableShading_LdsGroupReduce[i], shadingRate[i]);
#elif defined(FFX_GLSL)
atomicAnd(FFX_VariableShading_LdsGroupReduce[i], shadingRate[i]);
#endif
}
}
#if defined(FFX_HLSL)
GroupMemoryBarrierWithGroupSync();
#elif defined(FFX_GLSL)
barrier();
#endif
// write out final rates
if (Gidx < FFX_VariableShading_TilesPerGroup)
{
WriteVrsImage(
FfxInt32x2(Gid.xy * FFX_VariableShading_NumBlocks1D + FfxUInt32x2(Gidx / FFX_VariableShading_NumBlocks1D, Gidx % FFX_VariableShading_NumBlocks1D)),
FFX_VariableShading_LdsGroupReduce[Gidx]);
}
#else
// write out final rates
if (Gidx < FFX_VariableShading_TilesPerGroup)
{
WriteVrsImage(
FfxInt32x2(Gid.xy * FFX_VariableShading_NumBlocks1D + FfxUInt32x2(Gidx / FFX_VariableShading_NumBlocks1D, Gidx % FFX_VariableShading_NumBlocks1D)),
shadingRate[Gidx]);
}
#endif
}
#endif // FFX_VARIABLESHADING_ADDITIONALSHADINGRATES
#endif // FFX_CPP|FFX_GPU