379 lines
17 KiB
C
379 lines
17 KiB
C
// This file is part of the FidelityFX SDK.
|
|
//
|
|
// Copyright (C) 2024 Advanced Micro Devices, Inc.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files(the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions :
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
// THE SOFTWARE.
|
|
|
|
// Per-tile tracing mode stored in g_TileClass and consumed when rays are split between the
// software (screen space) and hardware ray lists.
#define TILE_CLASS_FULL_SW 0  // every eligible ray in the tile stays a software ray
#define TILE_CLASS_HALF_SW 1  // only the checkerboard half of the tile stays software
#define TILE_CLASS_FULL_HW 2  // no ray in the tile stays software
|
|
|
|
#define FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
|
|
|
|
#include "ffx_classifier_reflections_common.h"
|
|
|
|
// Hashes a 2D input into a 2D pseudo-random value.
// The result is the ffxFract() of a product of mixed components, so the same input always
// yields the same output.
FfxFloat32x2 Hash22(FfxFloat32x2 p)
{
    FfxFloat32x3 mixed = ffxFract(FfxFloat32x3(p.xyx) * FfxFloat32x3(.1031, .1030, .0973));
    mixed = mixed + dot(mixed, mixed.yzx + 33.33);
    return ffxFract((mixed.xx + mixed.yz) * mixed.zy);
}
|
|
|
|
// Returns a pseudo-random 2D value for `index` that changes with the current FrameIndex().
FfxFloat32x2 GetRandom(FfxUInt32x2 index)
{
    FfxFloat32 index_scale  = 0.152f;
    FfxFloat32 frame_offset = FfxFloat32(FrameIndex()) / 60.0f * 1500.0f;

    // Scaled index plus a per-frame offset forms the hash input.
    return Hash22(FfxFloat32x2(index) * index_scale + frame_offset + 50.0f);
}
|
|
|
|
// Same hash input construction as GetRandom(), but evaluated for FrameIndex() - 1,
// i.e. it reproduces the value generated for `index` one frame earlier.
FfxFloat32x2 GetRandomLastFrame(FfxUInt32x2 index)
{
    FfxFloat32 index_scale  = 0.152f;
    FfxFloat32 frame_offset = FfxFloat32(FrameIndex() - 1) / 60.0f * 1500.0f;

    return Hash22(FfxFloat32x2(index) * index_scale + frame_offset + 50.0f);
}
|
|
|
|
// Decides whether a tile is classified as software-traced.
//   hitcounter  - software hit count for the tile
//   misscounter - software miss count for the tile, weighted by HybridMissWeight()
//   rnd         - random value compared against the resulting threshold
// Returns true when rnd does not exceed HybridSpawnRate() + hits - weighted misses.
// The spawn-rate term lets a tile pass the test once in a while regardless of its counters,
// which gives HiZ traversal an opportunity to be tested again.
FfxBoolean IsSW(FfxFloat32 hitcounter, FfxFloat32 misscounter, FfxFloat32 rnd)
{
    FfxFloat32 sw_threshold = HybridSpawnRate() + hitcounter - misscounter * HybridMissWeight();
    return rnd <= sw_threshold;
}
|
|
|
|
// Returns true when the variance history, sampled at the position the pixel's motion vector
// points back to, is below VRTVarianceThreshold().
//   pixel_coordinate - pixel whose motion vector is loaded
//   uv               - uv of that pixel in the current frame
FfxBoolean IsConverged(FfxUInt32x2 pixel_coordinate, FfxFloat32x2 uv)
{
    FfxFloat32x2 history_uv = uv - LoadMotionVector(FfxInt32x2(pixel_coordinate));
    return SampleVarianceHistory(history_uv) < VRTVarianceThreshold();
}
|
|
|
|
// In case no ray is traced we still need to write the radiance buffer.
// Falls back to the environment probe: reflects the view ray about the surface normal,
// samples the environment map along the reflected direction and stores the result scaled
// by `factor`. A sample containing NaN is replaced by zero.
//   ray_coord - pixel to fill
//   factor    - multiplier applied to the environment sample before it is stored
void FillEnvironment(FfxUInt32x2 ray_coord, FfxFloat32 factor)
{
    FfxFloat32x2 uv                 = (ray_coord + 0.5) * InverseRenderSize();
    FfxFloat32x3 world_space_normal = LoadWorldSpaceNormal(FfxInt32x2(ray_coord));
    FfxFloat32   roughness          = LoadRoughnessFromMaterialParametersInput(FfxUInt32x3(ray_coord, 0));
    FfxFloat32   z                  = GetInputDepth(ray_coord);

    // Build the view-space ray through this pixel and mirror it about the surface normal.
    FfxFloat32x3 screen_uv_space_ray_origin     = FfxFloat32x3(uv, z);
    FfxFloat32x3 view_space_ray                 = ScreenSpaceToViewSpace(screen_uv_space_ray_origin);
    FfxFloat32x3 view_space_ray_direction       = normalize(view_space_ray);
    FfxFloat32x3 view_space_surface_normal      = FFX_MATRIX_MULTIPLY(ViewMatrix(), FfxFloat32x4(world_space_normal, 0)).xyz;
    FfxFloat32x3 view_space_reflected_direction = reflect(view_space_ray_direction, view_space_surface_normal);

    // The environment map is sampled with a world-space direction (w = 0: direction only).
    FfxFloat32x3 world_space_reflected_direction = FFX_MATRIX_MULTIPLY(InvView(), FfxFloat32x4(view_space_reflected_direction, 0)).xyz;

    FfxFloat32x3 env_sample = SampleEnvironmentMap(world_space_reflected_direction, sqrt(roughness));

    if (!any(isnan(env_sample)))
    {
        StoreRadiance(ray_coord, env_sample.xyzz * factor);
    }
    else
    {
        // Never let a NaN from the probe reach the radiance buffer.
        StoreRadiance(ray_coord, (0.0f).xxxx);
    }
}
|
|
|
|
// Stores an all-zero radiance value for the given pixel.
void ZeroBuffers(FfxUInt32x2 dispatch_thread_id)
{
    StoreRadiance(dispatch_thread_id, (0.0f).xxxx);
}
|
|
|
|
// Reprojects the reflector position: returns the history uv obtained by subtracting the
// motion vector from the current uv.
FfxFloat32x2 GetSurfaceReprojection(FfxFloat32x2 uv, FfxFloat32x2 motion_vector)
{
    return uv - motion_vector;
}
|
|
|
|
// Tells whether this thread is one of the rays that is always kept for its 2x2 quad.
//   samples_per_quad == 1 : only the thread with even x and even y (3 of 4 rays deactivated)
//   samples_per_quad == 2 : threads whose x and y parity match (2 of 4 deactivated, keeps the diagonal)
//   anything else (4)     : every thread
FfxBoolean IsBaseRay(FfxUInt32x2 dispatch_thread_id, FfxUInt32 samples_per_quad)
{
    FfxUInt32 x_parity = dispatch_thread_id.x & 1;
    FfxUInt32 y_parity = dispatch_thread_id.y & 1;

    if (samples_per_quad == 1)
    {
        return (x_parity | y_parity) == 0;
    }

    if (samples_per_quad == 2)
    {
        return x_parity == y_parity;
    }

    return true;
}
|
|
|
|
// Group-shared state used by ClassifyTiles.
FFX_GROUPSHARED FfxUInt32 g_TileCount;          // number of waves in the group that emitted at least one ray
FFX_GROUPSHARED FfxInt32  g_TileClass;          // one of the TILE_CLASS_* values
FFX_GROUPSHARED FfxUInt32 g_SWCount;            // software rays emitted by the whole group
FFX_GROUPSHARED FfxUInt32 g_SWCountTotal;       // g_SWCount rounded up to 32 or 64 (0 when no software rays)
FFX_GROUPSHARED FfxUInt32 g_base_ray_index_sw;  // base index returned by IncrementRayCounterSW for the group
|
|
|
|
// Classifies one pixel of a thread group (threads are flattened as x + y * 8) and emits the
// work items the later passes consume:
//   * picks a TILE_CLASS_* for the group from the hit counter history (hybrid mode only),
//   * decides per pixel whether a reflection ray is needed and whether it is a software
//     (screen space) or hardware ray,
//   * appends the rays to the software / hardware ray lists,
//   * registers the tile with the denoiser if any ray was emitted,
//   * fills pixels that get no ray with the environment probe or with zero.
//
//   dispatch_thread_id                       - pixel coordinate of this thread
//   group_thread_id                          - coordinate of this thread inside its group
//   roughness                                - surface roughness of the pixel
//   view_space_surface_normal                - surface normal; its z is tested against ReflectionsBackfacingThreshold()
//   depth                                    - pixel depth, passed to IsBackground()
//   screen_size                              - size used for the on-screen test and uv computation
//   samples_per_quad                         - 1, 2 or 4 base rays per 2x2 quad (see IsBaseRay)
//   enable_temporal_variance_guided_tracing  - also trace non-base rays that are not converged
//   enable_hitcounter                        - use hit counter feedback to classify the tile
//   enable_screen_space_tracing              - software rays are allowed
//   enable_hw_ray_tracing                    - hardware rays are allowed
void ClassifyTiles(FfxUInt32x2  dispatch_thread_id,
                   FfxUInt32x2  group_thread_id,
                   FfxFloat32   roughness,
                   FfxFloat32x3 view_space_surface_normal,
                   FfxFloat32   depth,
                   FfxInt32x2   screen_size,
                   FfxUInt32    samples_per_quad,
                   FfxBoolean   enable_temporal_variance_guided_tracing,
                   FfxBoolean   enable_hitcounter,
                   FfxBoolean   enable_screen_space_tracing,
                   FfxBoolean   enable_hw_ray_tracing)
{
    FfxUInt32  flat_group_thread_id  = group_thread_id.x + group_thread_id.y * 8;
    FfxBoolean is_first_lane_of_wave = ffxWaveIsFirstLane();

    if (group_thread_id.x == 0 && group_thread_id.y == 0)
    {
        // Initialize group shared variables
        g_TileCount         = 0;
        g_SWCount           = 0;
        g_SWCountTotal      = 0;
        g_base_ray_index_sw = 0;
        // Default class. It is only overridden below when the hit counter drives a hybrid tile,
        // so g_TileClass is never left unwritten.
        g_TileClass         = TILE_CLASS_FULL_SW;

#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
        // Initialize per 8x8 tile hit counter
        if (enable_hitcounter)
        {
            // In case we do hybrid
            if (enable_screen_space_tracing && enable_hw_ray_tracing)
            {
                // Feedback counters
                // See Intersect.hlsl
                FfxUInt32 hitcounter = 0;

// Use surface motion vectors of one of the 8x8 pixels in the tile to reproject statistics from the previous frame
// Helps a lot in movement to sustain temporal coherence
#define FFX_CLASSIFIER_CLASSIFICATION_REPROJECT_HITCOUNTER
#ifdef FFX_CLASSIFIER_CLASSIFICATION_REPROJECT_HITCOUNTER
                {
                    // Grab motion vector from a random point in the subgroup
                    FfxFloat32x2 xi                      = GetRandom(dispatch_thread_id.xy / 8);
                    FfxInt32x2   mix                     = FfxInt32x2(xi * 8.0f);
                    FfxFloat32x2 motion_vector           = LoadMotionVector(FfxInt32x2(dispatch_thread_id) + mix);
                    FfxFloat32x2 uv8                     = (FfxFloat32x2(dispatch_thread_id.xy + mix)) / FFX_DNSR_Reflections_RoundUp8(screen_size);
                    FfxFloat32x2 surface_reprojection_uv = GetSurfaceReprojection(uv8, motion_vector);
                    hitcounter = LoadHitCounterHistory(FfxUInt32x2(surface_reprojection_uv * (FFX_DNSR_Reflections_RoundUp8(screen_size) / 8)));
                }
#endif // FFX_CLASSIFIER_CLASSIFICATION_REPROJECT_HITCOUNTER

// Use 3x3 region to grab the biggest success rate and create a safe band of hybrid rays to hide artefacts in movements
#define FFX_CLASSIFIER_CLASSIFICATION_SAFEBAND
#ifdef FFX_CLASSIFIER_CLASSIFICATION_SAFEBAND
                FfxUInt32 same_pixel_hitcounter = 0;
                // We need a safe band for some geometry not in the BVH to avoid fireflies
                const FfxInt32 radius = 1;
                for (FfxInt32 y = -radius; y <= radius; y++)
                {
                    for (FfxInt32 x = -radius; x <= radius; x++)
                    {
                        FfxUInt32 pt = LoadHitCounterHistory(dispatch_thread_id.xy / 8 + FfxInt32x2(x, y));
                        if (FFX_Hitcounter_GetSWHits(pt) > FFX_Hitcounter_GetSWHits(same_pixel_hitcounter))
                        {
                            same_pixel_hitcounter = pt;
                        }
                    }
                }
#else  // FFX_CLASSIFIER_CLASSIFICATION_SAFEBAND
                FfxUInt32 same_pixel_hitcounter = LoadHitCounterHistory(dispatch_thread_id.xy / 8);
#endif // FFX_CLASSIFIER_CLASSIFICATION_SAFEBAND

                // Again compare with the same pixel and pick the one with the biggest success rate
                if (FFX_Hitcounter_GetSWHits(hitcounter) < FFX_Hitcounter_GetSWHits(same_pixel_hitcounter))
                {
                    hitcounter = same_pixel_hitcounter;
                }

                FfxFloat32 rnd              = GetRandom(dispatch_thread_id.xy / 8).x;
                FfxFloat32 rnd_last         = GetRandomLastFrame(dispatch_thread_id.xy / 8).x;
                FfxFloat32 sw_hitcount_new  = FfxFloat32(FFX_Hitcounter_GetSWHits(hitcounter));
                FfxFloat32 sw_hitcount_old  = FfxFloat32(FFX_Hitcounter_GetOldSWHits(hitcounter));
                FfxFloat32 sw_misscount_new = FfxFloat32(FFX_Hitcounter_GetSWMisses(hitcounter));
                FfxFloat32 sw_misscount_old = FfxFloat32(FFX_Hitcounter_GetOldSWMisses(hitcounter));
                FfxBoolean new_class        = IsSW(sw_hitcount_new, sw_misscount_new, rnd);
                FfxBoolean old_class        = IsSW(sw_hitcount_old, sw_misscount_old, rnd_last);

                // To make the transition less obvious we do an extra checkerboard stage:
                // a tile whose class changed since last frame is traced half SW / half HW.
                if (new_class == old_class)
                {
                    if (new_class)
                    {
                        g_TileClass = TILE_CLASS_FULL_SW;
                    }
                    else
                    {
                        g_TileClass = TILE_CLASS_FULL_HW;
                    }
                }
                else
                {
                    g_TileClass = TILE_CLASS_HALF_SW;
                }

                // This frame's counts become the "old" counts of the stored value.
                sw_hitcount_old  = sw_hitcount_new;
                sw_misscount_old = sw_misscount_new;
                StoreHitCounter(dispatch_thread_id.xy / 8,
                                (FfxUInt32(clamp(sw_hitcount_old, 0.0f, 255.0f)) << 8) | (FfxUInt32(clamp(sw_misscount_old, 0.0f, 255.0f)) << 24));
            }
        }
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    }
    FFX_GROUP_MEMORY_BARRIER;

    // First we figure out on a per thread basis if we need to shoot a reflection ray
    FfxBoolean is_on_screen = (dispatch_thread_id.x < screen_size.x) && (dispatch_thread_id.y < screen_size.y);
    // Allow for additional engine side checks. For example engines could additionally only cast reflection rays for specific depth ranges
    FfxBoolean is_surface = !IsBackground(depth);
    // Don't shoot a ray on very rough surfaces
    FfxBoolean is_glossy_reflection = is_surface && IsGlossyReflection(roughness);
    FfxBoolean needs_ray            = is_on_screen && is_glossy_reflection;

    // Decide which ray to keep
    FfxBoolean is_base_ray  = IsBaseRay(dispatch_thread_id, samples_per_quad);
    FfxBoolean is_converged = true;
    if (enable_temporal_variance_guided_tracing)
    {
        FfxFloat32x2 uv = (dispatch_thread_id + 0.5) / screen_size;
        is_converged    = IsConverged(dispatch_thread_id, uv);
    }

    needs_ray = needs_ray && (is_base_ray || !is_converged);

    // Extra check for back-facing rays, fresnel, mirror etc.
    if (abs(view_space_surface_normal.z) > ReflectionsBackfacingThreshold())
    {
        FillEnvironment(dispatch_thread_id, IBLFactor());
        needs_ray = false;
    }

    // We need the denoiser even for mirrors since the ssr/hw transition ends up creating popping tile fireflies.
    FfxBoolean needs_denoiser = is_glossy_reflection;

    // Next we have to figure out for which pixels that ray is creating the values for. Thus, if we have to copy its value horizontal, vertical or across.
    // Our pixel only requires a copy if we want to run a denoiser on it but don't want to shoot a ray for it.
    FfxBoolean require_copy = !needs_ray && needs_denoiser;

    FfxBoolean copy_horizontal = FfxBoolean(ffxWaveXorU1(FfxUInt32(require_copy), 1)) && (samples_per_quad != 4) && is_base_ray;  // QuadReadAcrossX
    FfxBoolean copy_vertical   = FfxBoolean(ffxWaveXorU1(FfxUInt32(require_copy), 2)) && (samples_per_quad == 1) && is_base_ray;  // QuadReadAcrossY
    FfxBoolean copy_diagonal   = FfxBoolean(ffxWaveXorU1(FfxUInt32(require_copy), 3)) && (samples_per_quad == 1) && is_base_ray;  // QuadReadAcrossDiagonal

    // Declared unconditionally because the final fallback test below reads both flags
    // whether or not hardware ray tracing is compiled in.
    // NOTE(review): without the HW macro every needed ray is treated as a software ray; that
    // configuration did not compile before this change - confirm this is the intended behaviour.
    FfxBoolean needs_sw_ray = needs_ray;
    FfxBoolean needs_hw_ray = false;

    // In case there's only software rays we don't do hybridization
#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    needs_sw_ray = needs_ray && enable_screen_space_tracing;

    if (enable_hw_ray_tracing && roughness < RTRoughnessThreshold())
    {
        FfxBoolean checkerboard = ((group_thread_id.x ^ group_thread_id.y) & 1) == 0;
        needs_sw_ray = needs_sw_ray && ((g_TileClass == TILE_CLASS_FULL_SW ? true : (g_TileClass == TILE_CLASS_HALF_SW ? checkerboard : false)));
        needs_hw_ray = needs_ray && !needs_sw_ray;
    }
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED

    FfxUInt32 local_ray_index_in_wave_sw = ffxWavePrefixCountBits(needs_sw_ray);
    // Zero-initialized: it is broadcast below even when this wave reserved no software rays.
    FfxUInt32 wave_ray_offset_in_group_sw = 0;
    FfxUInt32 wave_ray_count_sw           = ffxWaveActiveCountBits(needs_sw_ray);

#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    FfxUInt32 local_ray_index_in_wave_hw = ffxWavePrefixCountBits(needs_hw_ray);
    FfxUInt32 wave_ray_count_hw          = ffxWaveActiveCountBits(needs_hw_ray);
    FfxUInt32 base_ray_index_hw          = 0;
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED

    // One lane per wave reserves the range for the whole wave.
    if (is_first_lane_of_wave)
    {
        if (wave_ray_count_sw > 0)
        {
#ifdef FFX_GLSL
            wave_ray_offset_in_group_sw = FFX_ATOMIC_ADD(g_SWCount, FfxInt32(wave_ray_count_sw));
#else
            InterlockedAdd(g_SWCount, wave_ray_count_sw, wave_ray_offset_in_group_sw);
#endif
        }

#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
        if (wave_ray_count_hw > 0)
        {
            IncrementRayCounterHW(wave_ray_count_hw, base_ray_index_hw);
        }
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    }

    // Share the reserved offsets with every lane of the wave.
#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    base_ray_index_hw = ffxWaveReadLaneFirstU1(base_ray_index_hw);
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    wave_ray_offset_in_group_sw = ffxWaveReadLaneFirstU1(wave_ray_offset_in_group_sw);

    FFX_GROUP_MEMORY_BARRIER;
    if (flat_group_thread_id == 0 && g_SWCount > 0)
    {
        // [IMPORTANT] We need to round up to the multiple of 32 for software rays, because of the atomic increment coalescing optimization
        g_SWCountTotal = g_SWCount < 32 ? 32 : (g_SWCount > 32 ? 64 : 32);
        IncrementRayCounterSW(g_SWCountTotal, g_base_ray_index_sw);
    }
    FFX_GROUP_MEMORY_BARRIER;

    if (needs_sw_ray)
    {
        FfxUInt32 ray_index_sw = g_base_ray_index_sw + wave_ray_offset_in_group_sw + local_ray_index_in_wave_sw;
        StoreRay(ray_index_sw, dispatch_thread_id, copy_horizontal, copy_vertical, copy_diagonal);
    }
#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
    else if (needs_hw_ray)
    {
        FfxUInt32 ray_index_hw = base_ray_index_hw + local_ray_index_in_wave_hw;
        StoreRayHW(ray_index_hw, dispatch_thread_id, copy_horizontal, copy_vertical, copy_diagonal);
    }
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED

    if (flat_group_thread_id < g_SWCountTotal - g_SWCount)
    {
        // [IMPORTANT] We need to round up to the multiple of 32 for software rays, because of the atomic increment coalescing optimization
        // Emit helper (dead) lanes to fill the padded software ray range of this tile
        FfxUInt32 ray_index_sw = g_base_ray_index_sw + g_SWCount + flat_group_thread_id;
        StoreRaySWHelper(ray_index_sw);
    }

    // We only need denoiser if we trace any rays in the tile
    if (is_first_lane_of_wave && (wave_ray_count_sw > 0
#ifdef FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
                                  || wave_ray_count_hw > 0
#endif // FFX_CLASSIFIER_CLASSIFICATION_HW_RAYTRACING_ENABLED
                                  ))
    {
        FFX_ATOMIC_ADD(g_TileCount, 1);
    }

    FFX_GROUP_MEMORY_BARRIER;  // Wait until all waves wrote into g_TileCount

    if (g_TileCount > 0)
    {
        if (group_thread_id.x == 0 && group_thread_id.y == 0)
        {
            FfxUInt32 tile_index;
            IncrementDenoiserTileCounter(tile_index);
            StoreDenoiserTile(tile_index, dispatch_thread_id);
        }
    }

    if ((!needs_ray && !require_copy)                     // Discarded for some reason
        || (needs_ray && !needs_hw_ray && !needs_sw_ray)  // Or needs a ray but was discarded for some other reason
    )
    {
        if (is_surface)
        {
            FillEnvironment(dispatch_thread_id, IBLFactor());
        }
        else
        {
            ZeroBuffers(dispatch_thread_id);
        }
    }
}
|