engine/dep/include/FidelityFX/gpu/classifier/ffx_classifier_reflections.h

379 lines
17 KiB
C

// This file is part of the FidelityFX SDK.
//
// Copyright (C) 2024 Advanced Micro Devices, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// Per-tile classification stored in g_TileClass by ClassifyTiles():
//   FULL_SW - every eligible ray in the tile stays a software (screen-space) ray
//   HALF_SW - only the checkerboard half of the tile stays software, the rest goes to hardware
//   FULL_HW - no ray in the tile stays software
#define TILE_CLASS_FULL_SW 0
#define TILE_CLASS_HALF_SW 1
#define TILE_CLASS_FULL_HW 2
// Defined unconditionally here, so every `#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED`
// section in this file is compiled in.
#define FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
// NOTE(review): presumably declares the resource accessors and helpers used below
// (LoadMotionVector, StoreRadiance, FFX_Hitcounter_*, ...) - confirm against the header.
#include "ffx_classifier_reflections_common.h"
// Hashes a 2D value into a 2D value.
// The input is spread over three channels, scaled and passed through ffxFract,
// perturbed by a dot product with a swizzled copy of itself, and finally
// reduced back to two channels through ffxFract of a pairwise product.
FfxFloat32x2 Hash22(FfxFloat32x2 p)
{
    const FfxFloat32x3 channel_scale = FfxFloat32x3(.1031, .1030, .0973);

    FfxFloat32x3 h = ffxFract(FfxFloat32x3(p.xyx) * channel_scale);

    // Scalar perturbation added to all three channels.
    FfxFloat32 perturbation = dot(h, h.yzx + 33.33);
    h += perturbation;

    FfxFloat32x2 pair_sum = h.xx + h.yz;
    return ffxFract(pair_sum * h.zy);
}
// Returns a 2D hash value for `index` that changes with the current frame:
// the index is scaled, offset by a term derived from FrameIndex(), and fed to Hash22.
FfxFloat32x2 GetRandom(FfxUInt32x2 index)
{
    const FfxFloat32 index_scale = 0.152f;

    // Per-frame offset; kept as a separate term so the additions below happen
    // in the same left-to-right order as a single flat expression would.
    FfxFloat32 frame_term = FfxFloat32(FrameIndex()) / 60.0f * 1500.0f;

    FfxFloat32x2 seed = FfxFloat32x2(index) * index_scale + frame_term + 50.0f;
    return Hash22(seed);
}
// Same construction as GetRandom(), but seeded with FrameIndex() - 1 so the
// result matches what GetRandom() produced for `index` on the previous frame.
FfxFloat32x2 GetRandomLastFrame(FfxUInt32x2 index)
{
    const FfxFloat32 index_scale = 0.152f;

    // Previous-frame offset; the subtraction happens before the float conversion.
    FfxFloat32 frame_term = FfxFloat32(FrameIndex() - 1) / 60.0f * 1500.0f;

    FfxFloat32x2 seed = FfxFloat32x2(index) * index_scale + frame_term + 50.0f;
    return Hash22(seed);
}
// Decides whether a tile is classified as software-traced.
// Returns true when `rnd` does not exceed a threshold that grows with the
// hit count and shrinks with the (weighted) miss count.
FfxBoolean IsSW(FfxFloat32 hitcounter, FfxFloat32 misscounter, FfxFloat32 rnd)
{
    // HybridSpawnRate() is a constant bias on the threshold: it lets a random
    // tile pass once in a while even without recorded hits, which gives the
    // HiZ traversal an opportunity to be tested there.
    FfxFloat32 threshold = HybridSpawnRate() + hitcounter - misscounter * HybridMissWeight();
    return rnd <= threshold;
}
// Returns true when the variance history, sampled at the motion-reprojected
// position of this pixel, is below VRTVarianceThreshold().
FfxBoolean IsConverged(FfxUInt32x2 pixel_coordinate, FfxFloat32x2 uv)
{
    // Follow the pixel's motion vector back to its location in the history.
    FfxFloat32x2 history_uv = uv - LoadMotionVector(FfxInt32x2(pixel_coordinate));
    return SampleVarianceHistory(history_uv) < VRTVarianceThreshold();
}
// In case no ray is traced we still need to write the radiance buffer.
//
// Reconstructs the view ray for `ray_coord` from its depth, reflects it about
// the surface normal, samples the environment map along the reflected
// direction and stores the result scaled by `factor`. If the sample contains
// a NaN, zero is stored instead.
//
//   ray_coord - pixel coordinate to write
//   factor    - scale applied to the environment sample before storing
void FillEnvironment(FfxUInt32x2 ray_coord, FfxFloat32 factor)
{
    // Fall back to the environment probe
    FfxFloat32x2 uv                 = (ray_coord + 0.5) * InverseRenderSize();
    FfxFloat32x3 world_space_normal = LoadWorldSpaceNormal(FfxInt32x2(ray_coord));
    FfxFloat32   roughness          = LoadRoughnessFromMaterialParametersInput(FfxUInt32x3(ray_coord, 0));
    FfxFloat32   z                  = GetInputDepth(ray_coord);

    // View-space ray through the pixel at its depth.
    FfxFloat32x3 screen_uv_space_ray_origin = FfxFloat32x3(uv, z);
    FfxFloat32x3 view_space_ray             = ScreenSpaceToViewSpace(screen_uv_space_ray_origin);
    FfxFloat32x3 view_space_ray_direction   = normalize(view_space_ray);

    // Reflect in view space (w = 0: directions, not positions), then go back to world space.
    FfxFloat32x3 view_space_surface_normal       = FFX_MATRIX_MULTIPLY(ViewMatrix(), FfxFloat32x4(world_space_normal, 0)).xyz;
    FfxFloat32x3 view_space_reflected_direction  = reflect(view_space_ray_direction, view_space_surface_normal);
    FfxFloat32x3 world_space_reflected_direction = FFX_MATRIX_MULTIPLY(InvView(), FfxFloat32x4(view_space_reflected_direction, 0)).xyz;

    FfxFloat32x3 env_sample = SampleEnvironmentMap(world_space_reflected_direction, sqrt(roughness));

    if (!any(isnan(env_sample)))
    {
        // The fourth stored channel replicates the third one (.xyzz).
        StoreRadiance(ray_coord, env_sample.xyzz * factor);
    }
    else
    {
        // Never let a NaN reach the radiance buffer.
        StoreRadiance(ray_coord, (0.0f).xxxx);
    }
}
// Stores an all-zero radiance value for the given pixel.
void ZeroBuffers(FfxUInt32x2 dispatch_thread_id)
{
    FfxFloat32x4 zero_radiance = (0.0f).xxxx;
    StoreRadiance(dispatch_thread_id, zero_radiance);
}
// Reprojects the reflector position: returns the history UV obtained by
// stepping `uv` back along `motion_vector`.
FfxFloat32x2 GetSurfaceReprojection(FfxFloat32x2 uv, FfxFloat32x2 motion_vector)
{
    return uv - motion_vector;
}
// Tells whether the pixel at `dispatch_thread_id` keeps its ray for the given
// number of samples per 2x2 quad:
//   1     - only the pixel with even x and even y (deactivates 3 out of 4 rays)
//   2     - pixels whose x and y parities match (deactivates 2 out of 4, keeps the diagonal)
//   other - every pixel (the 4-samples case)
FfxBoolean IsBaseRay(FfxUInt32x2 dispatch_thread_id, FfxUInt32 samples_per_quad)
{
    if (samples_per_quad == 1)
    {
        // Both low bits must be clear.
        return ((dispatch_thread_id.x | dispatch_thread_id.y) & 1) == 0;
    }

    if (samples_per_quad == 2)
    {
        // Low bits of x and y must be equal.
        return ((dispatch_thread_id.x ^ dispatch_thread_id.y) & 1) == 0;
    }

    return true;
}
// Group-shared state of ClassifyTiles(). Thread (0,0) of the group writes the
// initial values before the first group barrier.

// Incremented once by every wave that emits at least one ray; a non-zero value
// makes the group register a denoiser tile.
FFX_GROUPSHARED FfxUInt32 g_TileCount;
// One of the TILE_CLASS_* values, written by thread (0,0) of the group.
FFX_GROUPSHARED FfxInt32 g_TileClass;
// Running sum of the per-wave software ray counts (atomic add by each wave's first lane).
FFX_GROUPSHARED FfxUInt32 g_SWCount;
// g_SWCount rounded up to 32 or 64; stays 0 when the group has no software rays.
FFX_GROUPSHARED FfxUInt32 g_SWCountTotal;
// Base index handed back by IncrementRayCounterSW() for this group's software rays.
FFX_GROUPSHARED FfxUInt32 g_base_ray_index_sw;
// Classifies one pixel of a thread group (the code assumes an 8-wide group and
// refers to it as an 8x8 tile) and emits the work items for the later passes.
//
// Steps, in order:
//   1. Thread (0,0) resets the group-shared counters and, for hybrid tracing
//      with the hit counter enabled, derives the tile class (full software,
//      checkerboard, full hardware) from the hit/miss statistics of the
//      previous frame.
//   2. Every thread decides whether its pixel needs a ray and, if so, whether
//      it is a software (screen-space) or a hardware ray.
//   3. Rays are compacted per wave and per group and written with StoreRay /
//      StoreRayHW; the software ray count of the group is padded to 32 or 64
//      with helper entries.
//   4. If any wave emitted a ray, the tile is registered for the denoiser.
//   5. Pixels that end up without a ray and without a copy get the environment
//      fallback (surfaces) or zero (background).
//
// Parameters:
//   dispatch_thread_id  - pixel coordinate of this thread
//   group_thread_id     - coordinate of this thread inside its group
//   roughness           - surface roughness of the pixel
//   view_space_surface_normal - surface normal in view space; only .z is read here
//   depth               - depth of the pixel, passed to IsBackground()
//   screen_size         - size used for the on-screen test and the UV computation
//   samples_per_quad    - 1, 2 or 4; see IsBaseRay()
//   enable_temporal_variance_guided_tracing - also trace non-base rays whose variance history is not converged
//   enable_hitcounter   - use the per-tile hit counter feedback
//   enable_screen_space_tracing - allow software rays
//   enable_hw_ray_tracing       - allow hardware rays
//
// The function contains group barriers and wave operations, so every thread of
// the group has to execute it.
void ClassifyTiles(FfxUInt32x2  dispatch_thread_id,
                   FfxUInt32x2  group_thread_id,
                   FfxFloat32   roughness,
                   FfxFloat32x3 view_space_surface_normal,
                   FfxFloat32   depth,
                   FfxInt32x2   screen_size,
                   FfxUInt32    samples_per_quad,
                   FfxBoolean   enable_temporal_variance_guided_tracing,
                   FfxBoolean   enable_hitcounter,
                   FfxBoolean   enable_screen_space_tracing,
                   FfxBoolean   enable_hw_ray_tracing)
{
    FfxUInt32  flat_group_thread_id  = group_thread_id.x + group_thread_id.y * 8;
    FfxBoolean is_first_lane_of_wave = ffxWaveIsFirstLane();

    if (group_thread_id.x == 0 && group_thread_id.y == 0)
    {
        // Initialize group shared variables
        g_TileCount         = 0;
        g_SWCount           = 0;
        g_SWCountTotal      = 0;
        g_base_ray_index_sw = 0;
        // Give the tile class a defined value on every path. Without this it stays
        // unwritten when the hit counter is enabled but hybrid tracing is not, and
        // it is still read further down.
        g_TileClass         = TILE_CLASS_FULL_SW;

#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
        // Initialize per 8x8 tile hit counter
        if (enable_hitcounter)
        {
            // In case we do hybrid
            if (enable_screen_space_tracing && enable_hw_ray_tracing)
            {
                // Feedback counters
                // See Intersect.hlsl
                FfxUInt32 hitcounter = 0;

                // Use surface motion vectors of one of the 8x8 pixels in the tile to reproject statistics from the previous frame
                // Helps a lot in movement to sustain temporal coherence
#define FFX_CLASSIFIER_CLASSIFICATION_REPROJECT_HITCOUNTER
#ifdef FFX_CLASSIFIER_CLASSIFICATION_REPROJECT_HITCOUNTER
                {
                    // Grab motion vector from a random point in the subgroup
                    FfxFloat32x2 xi                      = GetRandom(dispatch_thread_id.xy / 8);
                    FfxInt32x2   mix                     = FfxInt32x2(xi * 8.0f);
                    FfxFloat32x2 motion_vector           = LoadMotionVector(FfxInt32x2(dispatch_thread_id) + mix);
                    FfxFloat32x2 uv8                     = (FfxFloat32x2(dispatch_thread_id.xy + mix)) / FFX_DNSR_Reflections_RoundUp8(screen_size);
                    FfxFloat32x2 surface_reprojection_uv = GetSurfaceReprojection(uv8, motion_vector);
                    hitcounter = LoadHitCounterHistory(FfxUInt32x2(surface_reprojection_uv * (FFX_DNSR_Reflections_RoundUp8(screen_size) / 8)));
                }
#endif // FFX_CLASSIFIER_CLASSIFICATION_REPROJECT_HITCOUNTER

                // Use 3x3 region to grab the biggest success rate and create a safe band of hybrid rays to hide artefacts in movements
#define FFX_CLASSIFIER_CLASSIFICATION_SAFEBAND
#ifdef FFX_CLASSIFIER_CLASSIFICATION_SAFEBAND
                FfxUInt32 same_pixel_hitcounter = 0;
                // We need a safe band for some geometry not in the BVH to avoid fireflies
                const FfxInt32 radius = 1;
                for (FfxInt32 y = -radius; y <= radius; y++)
                {
                    for (FfxInt32 x = -radius; x <= radius; x++)
                    {
                        FfxUInt32 pt = LoadHitCounterHistory(dispatch_thread_id.xy / 8 + FfxInt32x2(x, y));
                        if (FFX_Hitcounter_GetSWHits(pt) > FFX_Hitcounter_GetSWHits(same_pixel_hitcounter))
                            same_pixel_hitcounter = pt;
                    }
                }
#else  // FFX_CLASSIFIER_CLASSIFICATION_SAFEBAND
                FfxUInt32 same_pixel_hitcounter = LoadHitCounterHistory(dispatch_thread_id.xy / 8);
#endif // FFX_CLASSIFIER_CLASSIFICATION_SAFEBAND

                // Again compare with the same pixel and pick the one with the biggest success rate
                if (FFX_Hitcounter_GetSWHits(hitcounter) < FFX_Hitcounter_GetSWHits(same_pixel_hitcounter))
                    hitcounter = same_pixel_hitcounter;

                FfxFloat32 rnd              = GetRandom(dispatch_thread_id.xy / 8).x;
                FfxFloat32 rnd_last         = GetRandomLastFrame(dispatch_thread_id.xy / 8).x;
                FfxFloat32 sw_hitcount_new  = FfxFloat32(FFX_Hitcounter_GetSWHits(hitcounter));
                FfxFloat32 sw_hitcount_old  = FfxFloat32(FFX_Hitcounter_GetOldSWHits(hitcounter));
                FfxFloat32 sw_misscount_new = FfxFloat32(FFX_Hitcounter_GetSWMisses(hitcounter));
                FfxFloat32 sw_misscount_old = FfxFloat32(FFX_Hitcounter_GetOldSWMisses(hitcounter));
                FfxBoolean new_class        = IsSW(sw_hitcount_new, sw_misscount_new, rnd);
                FfxBoolean old_class        = IsSW(sw_hitcount_old, sw_misscount_old, rnd_last);

                // To make the transition less obvious we do an extra checkerboard stage:
                // a tile whose class differs between the two frames is traced half/half.
                if (new_class == old_class)
                {
                    if (new_class)
                    {
                        g_TileClass = TILE_CLASS_FULL_SW;
                    }
                    else
                    {
                        g_TileClass = TILE_CLASS_FULL_HW;
                    }
                }
                else
                {
                    g_TileClass = TILE_CLASS_HALF_SW;
                }

                // The current counts become the "old" counts of the stored value:
                // hits are shifted to bit 8, misses to bit 24, each clamped to 0..255.
                sw_hitcount_old  = sw_hitcount_new;
                sw_misscount_old = sw_misscount_new;
                StoreHitCounter(dispatch_thread_id.xy / 8,
                                (FfxUInt32(clamp(sw_hitcount_old, 0.0f, 255.0f)) << 8) | (FfxUInt32(clamp(sw_misscount_old, 0.0f, 255.0f)) << 24));
            }
        }
        else
        {
            g_TileClass = TILE_CLASS_FULL_SW;
        }
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    }
    FFX_GROUP_MEMORY_BARRIER;

    // First we figure out on a per thread basis if we need to shoot a reflection ray
    FfxBoolean is_on_screen = (dispatch_thread_id.x < screen_size.x) && (dispatch_thread_id.y < screen_size.y);
    // Allow for additional engine side checks. For example engines could additionally only cast reflection rays for specific depth ranges
    FfxBoolean is_surface = !IsBackground(depth);
    // Don't shoot a ray on very rough surfaces
    FfxBoolean is_glossy_reflection = is_surface && IsGlossyReflection(roughness);
    FfxBoolean needs_ray            = is_on_screen && is_glossy_reflection;

    // Decide which ray to keep
    FfxBoolean is_base_ray  = IsBaseRay(dispatch_thread_id, samples_per_quad);
    FfxBoolean is_converged = true;
    if (enable_temporal_variance_guided_tracing)
    {
        FfxFloat32x2 uv = (dispatch_thread_id + 0.5) / screen_size;
        is_converged    = IsConverged(dispatch_thread_id, uv);
    }
    needs_ray = needs_ray && (is_base_ray || !is_converged);

    // Extra check for back-facing rays, fresnel, mirror etc.
    if (abs(view_space_surface_normal.z) > ReflectionsBackfacingThreshold())
    {
        FillEnvironment(dispatch_thread_id, IBLFactor());
        needs_ray = false;
    }

    // We need the denoiser even for mirrors since the ssr/hw transition ends up creating popping tile fireflies.
    FfxBoolean needs_denoiser = is_glossy_reflection;

    // Next we have to figure out for which pixels that ray is creating the values for. Thus, if we have to copy its value horizontal, vertical or across.
    // Our pixel only requires a copy if we want to run a denoiser on it but don't want to shoot a ray for it.
    FfxBoolean require_copy    = !needs_ray && needs_denoiser;
    FfxBoolean copy_horizontal = FfxBoolean(ffxWaveXorU1(FfxUInt32(require_copy), 1)) && (samples_per_quad != 4) && is_base_ray;  // QuadReadAcrossX
    FfxBoolean copy_vertical   = FfxBoolean(ffxWaveXorU1(FfxUInt32(require_copy), 2)) && (samples_per_quad == 1) && is_base_ray;  // QuadReadAcrossY
    FfxBoolean copy_diagonal   = FfxBoolean(ffxWaveXorU1(FfxUInt32(require_copy), 3)) && (samples_per_quad == 1) && is_base_ray;  // QuadReadAcrossDiagonal

    // In case there's only software rays we don't do hybridization.
    // NOTE(review): the software-only default used to be `true`, which would emit a ray
    // for every lane; `needs_ray` looks like the intended value - confirm. With the HW
    // path compiled in (as in this file) the default is overwritten right below.
    FfxBoolean needs_sw_ray = needs_ray;
    // Declared outside the #ifdef because the fallback test at the end of the function reads it.
    FfxBoolean needs_hw_ray = false;
#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    needs_sw_ray = needs_ray && enable_screen_space_tracing;
    if (enable_hw_ray_tracing && roughness < RTRoughnessThreshold())
    {
        FfxBoolean checkerboard = ((group_thread_id.x ^ group_thread_id.y) & 1) == 0;
        needs_sw_ray = needs_sw_ray && ((g_TileClass == TILE_CLASS_FULL_SW ? true : (g_TileClass == TILE_CLASS_HALF_SW ? checkerboard : false)));
        needs_hw_ray = needs_ray && !needs_sw_ray;
    }
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED

    // Per-wave compaction of the software rays.
    FfxUInt32 local_ray_index_in_wave_sw = ffxWavePrefixCountBits(needs_sw_ray);
    // Initialized so that waves without software rays never read an undefined value below.
    FfxUInt32 wave_ray_offset_in_group_sw = 0;
    FfxUInt32 wave_ray_count_sw           = ffxWaveActiveCountBits(needs_sw_ray);
#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    FfxUInt32 local_ray_index_in_wave_hw = ffxWavePrefixCountBits(needs_hw_ray);
    FfxUInt32 wave_ray_count_hw          = ffxWaveActiveCountBits(needs_hw_ray);
    FfxUInt32 base_ray_index_hw          = 0;
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED

    // One lane per wave reserves the slots for the whole wave.
    if (is_first_lane_of_wave)
    {
        if (wave_ray_count_sw > 0)
        {
#ifdef FFX_GLSL
            wave_ray_offset_in_group_sw = FFX_ATOMIC_ADD(g_SWCount, FfxInt32(wave_ray_count_sw));
#else
            InterlockedAdd(g_SWCount, wave_ray_count_sw, wave_ray_offset_in_group_sw);
#endif
        }
#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
        if (wave_ray_count_hw > 0)
            IncrementRayCounterHW(wave_ray_count_hw, base_ray_index_hw);
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    }

    // Broadcast the reserved offsets from the first lane to the rest of the wave.
#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    base_ray_index_hw = ffxWaveReadLaneFirstU1(base_ray_index_hw);
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    wave_ray_offset_in_group_sw = ffxWaveReadLaneFirstU1(wave_ray_offset_in_group_sw);

    FFX_GROUP_MEMORY_BARRIER;

    if (flat_group_thread_id == 0 && g_SWCount > 0)
    {
        // [IMPORTANT] We need to round up to the multiple of 32 for software rays, because of the atomic increment coalescing optimization
        g_SWCountTotal = g_SWCount < 32 ? 32 : (g_SWCount > 32 ? 64 : 32);
        IncrementRayCounterSW(g_SWCountTotal, g_base_ray_index_sw);
    }

    FFX_GROUP_MEMORY_BARRIER;

    if (needs_sw_ray)
    {
        FfxUInt32 ray_index_sw = g_base_ray_index_sw + wave_ray_offset_in_group_sw + local_ray_index_in_wave_sw;
        StoreRay(ray_index_sw, dispatch_thread_id, copy_horizontal, copy_vertical, copy_diagonal);
    }
#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    else if (needs_hw_ray)
    {
        FfxUInt32 ray_index_hw = base_ray_index_hw + local_ray_index_in_wave_hw;
        StoreRayHW(ray_index_hw, dispatch_thread_id, copy_horizontal, copy_vertical, copy_diagonal);
    }
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED

    if (flat_group_thread_id < g_SWCountTotal - g_SWCount)
    {
        // [IMPORTANT] We need to round up to the multiple of 32 for software rays, because of the atomic increment coalescing optimization
        // Emit helper(dead) lanes to fill up 32 lanes per 8x8 tile
        FfxUInt32 ray_index_sw = g_base_ray_index_sw + g_SWCount + flat_group_thread_id;
        StoreRaySWHelper(ray_index_sw);
    }

    // We only need denoiser if we trace any rays in the tile
    if (is_first_lane_of_wave && (wave_ray_count_sw > 0
#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
                                  || wave_ray_count_hw > 0
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
                                  ))
    {
        FFX_ATOMIC_ADD(g_TileCount, 1);
    }

    FFX_GROUP_MEMORY_BARRIER;  // Wait until all waves wrote into g_TileCount

    if (g_TileCount > 0)
    {
        if (group_thread_id.x == 0 && group_thread_id.y == 0)
        {
            FfxUInt32 tile_index;
            IncrementDenoiserTileCounter(tile_index);
            StoreDenoiserTile(tile_index, dispatch_thread_id);
        }
    }

    if ((!needs_ray && !require_copy)                     // Discarded for some reason
        || (needs_ray && !needs_hw_ray && !needs_sw_ray)  // Or needs a ray but was discarded for some other reason
    )
    {
        if (is_surface)
        {
            FillEnvironment(dispatch_thread_id, IBLFactor());
        }
        else
        {
            ZeroBuffers(dispatch_thread_id);
        }
    }
}