engine/dep/include/FidelityFX/gpu/denoiser/ffx_denoiser_reflections_reproject.h

723 lines
47 KiB
C

// This file is part of the FidelityFX SDK.
//
// Copyright (C) 2024 Advanced Micro Devices, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef FFX_DNSR_REFLECTIONS_REPROJECT
#define FFX_DNSR_REFLECTIONS_REPROJECT
#define FFX_DNSR_REFLECTIONS_ESTIMATES_LOCAL_NEIGHBORHOOD
#include "ffx_denoiser_reflections_common.h"
FFX_GROUPSHARED FfxUInt32 g_ffx_dnsr_shared_0[16][16];
FFX_GROUPSHARED FfxUInt32 g_ffx_dnsr_shared_1[16][16];
#if FFX_HALF
struct FFX_DNSR_Reflections_NeighborhoodSample {
FfxFloat16x3 radiance;
};
FFX_DNSR_Reflections_NeighborhoodSample FFX_DNSR_Reflections_LoadFromGroupSharedMemory(FfxInt32x2 idx) {
FfxUInt32x2 packed_radiance = FfxUInt32x2(g_ffx_dnsr_shared_0[idx.y][idx.x], g_ffx_dnsr_shared_1[idx.y][idx.x]);
FfxFloat16x4 unpacked_radiance = FFX_DNSR_Reflections_UnpackFloat16_4(packed_radiance);
FFX_DNSR_Reflections_NeighborhoodSample neighborSample;
neighborSample.radiance = unpacked_radiance.xyz;
return neighborSample;
}
void FFX_DNSR_Reflections_StoreInGroupSharedMemory(FfxInt32x2 group_thread_id, FfxFloat16x3 radiance) {
g_ffx_dnsr_shared_0[group_thread_id.y][group_thread_id.x] = FFX_DNSR_Reflections_PackFloat16(radiance.xy);
g_ffx_dnsr_shared_1[group_thread_id.y][group_thread_id.x] = FFX_DNSR_Reflections_PackFloat16(radiance.zz);
}
void FFX_DNSR_Reflections_StoreInGroupSharedMemory(FfxInt32x2 group_thread_id, FfxFloat16x4 radiance_variance) {
g_ffx_dnsr_shared_0[group_thread_id.y][group_thread_id.x] = FFX_DNSR_Reflections_PackFloat16(radiance_variance.xy);
g_ffx_dnsr_shared_1[group_thread_id.y][group_thread_id.x] = FFX_DNSR_Reflections_PackFloat16(radiance_variance.zw);
}
void FFX_DNSR_Reflections_InitializeGroupSharedMemory(FfxInt32x2 dispatch_thread_id, FfxInt32x2 group_thread_id, FfxInt32x2 screen_size) {
// Load 16x16 region into shared memory using 4 8x8 blocks.
FfxInt32x2 offset[4] = {FfxInt32x2(0, 0), FfxInt32x2(8, 0), FfxInt32x2(0, 8), FfxInt32x2(8, 8)};
// Intermediate storage registers to cache the result of all loads
FfxFloat16x3 radiance[4];
// Start in the upper left corner of the 16x16 region.
dispatch_thread_id -= 4;
// First store all loads in registers
for (FfxInt32 i = 0; i < 4; ++i) {
radiance[i] = FFX_DNSR_Reflections_LoadRadiance(dispatch_thread_id + offset[i]);
}
// Then move all registers to FFX_GROUPSHARED memory
for (FfxInt32 j = 0; j < 4; ++j) {
FFX_DNSR_Reflections_StoreInGroupSharedMemory(group_thread_id + offset[j], radiance[j]); // X
}
}
FfxFloat16x4 FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2 idx) {
FfxUInt32x2 packed_radiance = FfxUInt32x2(g_ffx_dnsr_shared_0[idx.y][idx.x], g_ffx_dnsr_shared_1[idx.y][idx.x]);
return FFX_DNSR_Reflections_UnpackFloat16_4(packed_radiance);
}
FfxFloat16 FFX_DNSR_Reflections_GetLuminanceWeight(FfxFloat16x3 val) {
FfxFloat16 luma = FFX_DNSR_Reflections_Luminance(val.xyz);
FfxFloat16 weight = FfxFloat16(max(exp(-luma * FFX_DNSR_REFLECTIONS_AVG_RADIANCE_LUMINANCE_WEIGHT), 1.0e-2));
return weight;
}
FfxFloat32x2 FFX_DNSR_Reflections_GetSurfaceReprojection(FfxInt32x2 dispatch_thread_id, FfxFloat32x2 uv, FfxFloat32x2 motion_vector) {
// Reflector position reprojection
FfxFloat32x2 history_uv = uv + motion_vector;
return history_uv;
}
FfxFloat32x2 FFX_DNSR_Reflections_GetHitPositionReprojection(FfxInt32x2 dispatch_thread_id, FfxFloat32x2 uv, FfxFloat32 reflected_ray_length) {
FfxFloat32 z = FFX_DNSR_Reflections_LoadDepth(dispatch_thread_id);
FfxFloat32x3 view_space_ray = FFX_DNSR_Reflections_ScreenSpaceToViewSpace(FfxFloat32x3(uv, z));
// We start out with reconstructing the ray length in view space.
// This includes the portion from the camera to the reflecting surface as well as the portion from the surface to the hit position.
FfxFloat32 surface_depth = length(view_space_ray);
FfxFloat32 ray_length = surface_depth + reflected_ray_length;
// We then perform a parallax correction by shooting a ray
// of the same length "straight through" the reflecting surface
// and reprojecting the tip of that ray to the previous frame.
view_space_ray /= surface_depth; // == normalize(view_space_ray)
view_space_ray *= ray_length;
FfxFloat32x3 world_hit_position =
FFX_DNSR_Reflections_ViewSpaceToWorldSpace(FfxFloat32x4(view_space_ray, 1)); // This is the "fake" hit position if we would follow the ray straight through the surface.
FfxFloat32x3 prev_hit_position = FFX_DNSR_Reflections_WorldSpaceToScreenSpacePrevious(world_hit_position);
FfxFloat32x2 history_uv = prev_hit_position.xy;
return history_uv;
}
FfxFloat16 FFX_DNSR_Reflections_GetDisocclusionFactor(FfxFloat16x3 normal, FfxFloat16x3 history_normal, FfxFloat32 linear_depth, FfxFloat32 history_linear_depth) {
FfxFloat16 factor = FfxFloat16(1.0f //
* exp(-abs(1.0f - max(FfxFloat16(0.0f), dot(normal, history_normal))) * FFX_DNSR_REFLECTIONS_DISOCCLUSION_NORMAL_WEIGHT) //
* exp(-abs(history_linear_depth - linear_depth) / linear_depth * FFX_DNSR_REFLECTIONS_DISOCCLUSION_DEPTH_WEIGHT));
return factor;
}
struct FFX_DNSR_Reflections_Moments {
FfxFloat16x3 mean;
FfxFloat16x3 variance;
};
FFX_DNSR_Reflections_Moments FFX_DNSR_Reflections_EstimateLocalNeighborhoodInGroup(FfxInt32x2 group_thread_id) {
FFX_DNSR_Reflections_Moments estimate;
estimate.mean = FfxFloat16x3(0.0f, 0.0f, 0.0f);
estimate.variance = FfxFloat16x3(0.0f, 0.0f, 0.0f);
FfxFloat16 accumulated_weight = FfxFloat16(0.0f);
for (FfxInt32 j = -FFX_DNSR_REFLECTIONS_LOCAL_NEIGHBORHOOD_RADIUS; j <= FFX_DNSR_REFLECTIONS_LOCAL_NEIGHBORHOOD_RADIUS; ++j) {
for (FfxInt32 i = -FFX_DNSR_REFLECTIONS_LOCAL_NEIGHBORHOOD_RADIUS; i <= FFX_DNSR_REFLECTIONS_LOCAL_NEIGHBORHOOD_RADIUS; ++i) {
FfxInt32x2 new_idx = group_thread_id + FfxInt32x2(i, j);
FfxFloat16x3 radiance = FFX_DNSR_Reflections_LoadFromGroupSharedMemory(new_idx).radiance;
FfxFloat16 weight = FFX_DNSR_Reflections_LocalNeighborhoodKernelWeight(FfxFloat16(i)) * FFX_DNSR_Reflections_LocalNeighborhoodKernelWeight(FfxFloat16(j));
accumulated_weight += weight;
estimate.mean += radiance * weight;
estimate.variance += radiance * radiance * weight;
}
}
estimate.mean /= accumulated_weight;
estimate.variance /= accumulated_weight;
estimate.variance = abs(estimate.variance - estimate.mean * estimate.mean);
return estimate;
}
FfxFloat32 dot2(FfxFloat32x3 a) { return dot(a, a); }
void FFX_DNSR_Reflections_PickReprojection(FfxInt32x2 dispatch_thread_id, //
FfxInt32x2 group_thread_id, //
FfxUInt32x2 screen_size, //
FfxFloat16 roughness, //
FfxFloat16 ray_length, //
out FfxFloat16 disocclusion_factor, //
out FfxFloat32x2 reprojection_uv, //
out FfxFloat16x3 reprojection) {
FFX_DNSR_Reflections_Moments local_neighborhood = FFX_DNSR_Reflections_EstimateLocalNeighborhoodInGroup(group_thread_id);
FfxFloat32x2 uv = FfxFloat32x2(dispatch_thread_id.x + 0.5, dispatch_thread_id.y + 0.5) / screen_size;
FfxFloat16x3 normal = FFX_DNSR_Reflections_LoadWorldSpaceNormal(dispatch_thread_id);
FfxFloat16x3 history_normal;
FfxFloat32 history_linear_depth;
{
const FfxFloat32x2 motion_vector = FFX_DNSR_Reflections_LoadMotionVector(dispatch_thread_id);
const FfxFloat32x2 surface_reprojection_uv = FFX_DNSR_Reflections_GetSurfaceReprojection(dispatch_thread_id, uv, motion_vector);
const FfxFloat32x2 hit_reprojection_uv = FFX_DNSR_Reflections_GetHitPositionReprojection(dispatch_thread_id, uv, ray_length);
const FfxFloat16x3 surface_normal = FFX_DNSR_Reflections_SampleWorldSpaceNormalHistory(surface_reprojection_uv);
const FfxFloat16x3 hit_normal = FFX_DNSR_Reflections_SampleWorldSpaceNormalHistory(hit_reprojection_uv);
const FfxFloat16x3 surface_history = FFX_DNSR_Reflections_SampleRadianceHistory(surface_reprojection_uv);
const FfxFloat16x3 hit_history = FFX_DNSR_Reflections_SampleRadianceHistory(hit_reprojection_uv);
const FfxFloat32 hit_normal_similarity = dot(normalize(FfxFloat32x3(hit_normal)), normalize(FfxFloat32x3(normal)));
const FfxFloat32 surface_normal_similarity = dot(normalize(FfxFloat32x3(surface_normal)), normalize(FfxFloat32x3(normal)));
const FfxFloat16 hit_roughness = FFX_DNSR_Reflections_SampleRoughnessHistory(hit_reprojection_uv);
const FfxFloat16 surface_roughness = FFX_DNSR_Reflections_SampleRoughnessHistory(surface_reprojection_uv);
// Choose reprojection uv based on similarity to the local neighborhood.
if (hit_normal_similarity > FFX_DNSR_REFLECTIONS_REPROJECTION_NORMAL_SIMILARITY_THRESHOLD // Candidate for mirror reflection parallax
&& hit_normal_similarity + 1.0e-3 > surface_normal_similarity //
&& abs(hit_roughness - roughness) < abs(surface_roughness - roughness) + 1.0e-3 //
) {
history_normal = hit_normal;
FfxFloat32 hit_history_depth = FFX_DNSR_Reflections_SampleDepthHistory(hit_reprojection_uv);
FfxFloat32 hit_history_linear_depth = FFX_DNSR_Reflections_GetLinearDepth(hit_reprojection_uv, hit_history_depth);
history_linear_depth = hit_history_linear_depth;
reprojection_uv = hit_reprojection_uv;
reprojection = hit_history;
} else {
// Reject surface reprojection based on simple distance
if (dot2(surface_history - local_neighborhood.mean) <
FFX_DNSR_REFLECTIONS_REPROJECT_SURFACE_DISCARD_VARIANCE_WEIGHT * length(local_neighborhood.variance)) {
history_normal = surface_normal;
FfxFloat32 surface_history_depth = FFX_DNSR_Reflections_SampleDepthHistory(surface_reprojection_uv);
FfxFloat32 surface_history_linear_depth = FFX_DNSR_Reflections_GetLinearDepth(surface_reprojection_uv, surface_history_depth);
history_linear_depth = surface_history_linear_depth;
reprojection_uv = surface_reprojection_uv;
reprojection = surface_history;
} else {
disocclusion_factor = FfxFloat16(0.0f);
return;
}
}
}
FfxFloat32 depth = FFX_DNSR_Reflections_LoadDepth(dispatch_thread_id);
FfxFloat32 linear_depth = FFX_DNSR_Reflections_GetLinearDepth(uv, depth);
// Determine disocclusion factor based on history
disocclusion_factor = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, history_normal, linear_depth, history_linear_depth);
if (disocclusion_factor > FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD) // Early out, good enough
return;
// Try to find the closest sample in the vicinity if we are not convinced of a disocclusion
if (disocclusion_factor < FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD) {
FfxFloat32x2 closest_uv = reprojection_uv;
FfxFloat32x2 dudv = 1.0 / FfxFloat32x2(screen_size);
const FfxInt32 search_radius = 1;
for (FfxInt32 y = -search_radius; y <= search_radius; y++) {
for (FfxInt32 x = -search_radius; x <= search_radius; x++) {
FfxFloat32x2 uv = reprojection_uv + FfxFloat32x2(x, y) * dudv;
FfxFloat16x3 history_normal = FFX_DNSR_Reflections_SampleWorldSpaceNormalHistory(uv);
FfxFloat32 history_depth = FFX_DNSR_Reflections_SampleDepthHistory(uv);
FfxFloat32 history_linear_depth = FFX_DNSR_Reflections_GetLinearDepth(uv, history_depth);
FfxFloat16 weight = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, history_normal, linear_depth, history_linear_depth);
if (weight > disocclusion_factor) {
disocclusion_factor = weight;
closest_uv = uv;
reprojection_uv = closest_uv;
}
}
}
reprojection = FFX_DNSR_Reflections_SampleRadianceHistory(reprojection_uv);
}
// Rare slow path - triggered only on the edges.
// Try to get rid of potential leaks at bilinear interpolation level.
if (disocclusion_factor < FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD) {
// If we've got a discarded history, try to construct a better sample out of 2x2 interpolation neighborhood
// Helps quite a bit on the edges in movement
FfxFloat32 uvx = ffxFract(FfxFloat32(screen_size.x) * reprojection_uv.x + 0.5);
FfxFloat32 uvy = ffxFract(FfxFloat32(screen_size.y) * reprojection_uv.y + 0.5);
FfxInt32x2 reproject_texel_coords = FfxInt32x2(screen_size * reprojection_uv - 0.5);
FfxFloat16x3 reprojection00 = FFX_DNSR_Reflections_LoadRadianceHistory(reproject_texel_coords + FfxInt32x2(0, 0));
FfxFloat16x3 reprojection10 = FFX_DNSR_Reflections_LoadRadianceHistory(reproject_texel_coords + FfxInt32x2(1, 0));
FfxFloat16x3 reprojection01 = FFX_DNSR_Reflections_LoadRadianceHistory(reproject_texel_coords + FfxInt32x2(0, 1));
FfxFloat16x3 reprojection11 = FFX_DNSR_Reflections_LoadRadianceHistory(reproject_texel_coords + FfxInt32x2(1, 1));
FfxFloat16x3 normal00 = FFX_DNSR_Reflections_LoadWorldSpaceNormalHistory(reproject_texel_coords + FfxInt32x2(0, 0));
FfxFloat16x3 normal10 = FFX_DNSR_Reflections_LoadWorldSpaceNormalHistory(reproject_texel_coords + FfxInt32x2(1, 0));
FfxFloat16x3 normal01 = FFX_DNSR_Reflections_LoadWorldSpaceNormalHistory(reproject_texel_coords + FfxInt32x2(0, 1));
FfxFloat16x3 normal11 = FFX_DNSR_Reflections_LoadWorldSpaceNormalHistory(reproject_texel_coords + FfxInt32x2(1, 1));
FfxFloat32 depth00 = FFX_DNSR_Reflections_GetLinearDepth(reprojection_uv, FFX_DNSR_Reflections_LoadDepthHistory(reproject_texel_coords + FfxInt32x2(0, 0)));
FfxFloat32 depth10 = FFX_DNSR_Reflections_GetLinearDepth(reprojection_uv, FFX_DNSR_Reflections_LoadDepthHistory(reproject_texel_coords + FfxInt32x2(1, 0)));
FfxFloat32 depth01 = FFX_DNSR_Reflections_GetLinearDepth(reprojection_uv, FFX_DNSR_Reflections_LoadDepthHistory(reproject_texel_coords + FfxInt32x2(0, 1)));
FfxFloat32 depth11 = FFX_DNSR_Reflections_GetLinearDepth(reprojection_uv, FFX_DNSR_Reflections_LoadDepthHistory(reproject_texel_coords + FfxInt32x2(1, 1)));
FfxFloat16x4 w = FfxFloat16x4(1.0f, 1.0f, 1.0f, 1.0f);
// Initialize with occlusion weights
w.x = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, normal00, linear_depth, depth00) > FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD / 2.0f ? FfxFloat16(1.0f) : FfxFloat16(0.0f);
w.y = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, normal10, linear_depth, depth10) > FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD / 2.0f ? FfxFloat16(1.0f) : FfxFloat16(0.0f);
w.z = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, normal01, linear_depth, depth01) > FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD / 2.0f ? FfxFloat16(1.0f) : FfxFloat16(0.0f);
w.w = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, normal11, linear_depth, depth11) > FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD / 2.0f ? FfxFloat16(1.0f) : FfxFloat16(0.0f);
// And then mix in bilinear weights
w.x = FfxFloat16(w.x * (1.0 - uvx) * (1.0 - uvy));
w.y = FfxFloat16(w.y * (uvx) * (1.0 - uvy));
w.z = FfxFloat16(w.z * (1.0 - uvx) * (uvy));
w.w = FfxFloat16(w.w * (uvx) * (uvy));
FfxFloat16 ws = max(w.x + w.y + w.z + w.w, FfxFloat16(1.0e-3));
// normalize
w /= ws;
FfxFloat16x3 history_normal;
FfxFloat32 history_linear_depth;
reprojection = reprojection00 * w.x + reprojection10 * w.y + reprojection01 * w.z + reprojection11 * w.w;
history_linear_depth = depth00 * w.x + depth10 * w.y + depth01 * w.z + depth11 * w.w;
history_normal = normal00 * w.x + normal10 * w.y + normal01 * w.z + normal11 * w.w;
disocclusion_factor = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, history_normal, linear_depth, history_linear_depth);
}
disocclusion_factor = disocclusion_factor < FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD ? FfxFloat16(0.0f) : disocclusion_factor;
}
void FFX_DNSR_Reflections_Reproject(FfxInt32x2 dispatch_thread_id, FfxInt32x2 group_thread_id, FfxUInt32x2 screen_size, FfxFloat32 temporal_stability_factor, FfxInt32 max_samples) {
FFX_DNSR_Reflections_InitializeGroupSharedMemory(dispatch_thread_id, group_thread_id, FfxInt32x2(screen_size));
FFX_GROUP_MEMORY_BARRIER;
group_thread_id += 4; // Center threads in FFX_GROUPSHARED memory
FfxFloat16 variance = FfxFloat16(1.0f);
FfxFloat16 num_samples = FfxFloat16(0.0f);
FfxFloat16 roughness = FFX_DNSR_Reflections_LoadRoughness(dispatch_thread_id);
FfxFloat32x3 normal = FFX_DNSR_Reflections_LoadWorldSpaceNormal(dispatch_thread_id);
FfxFloat16x3 radiance = FFX_DNSR_Reflections_LoadRadiance(dispatch_thread_id);
const FfxFloat16 ray_length = FFX_DNSR_Reflections_LoadRayLength(dispatch_thread_id);
if (FFX_DNSR_Reflections_IsGlossyReflection(roughness)) {
FfxFloat16 disocclusion_factor;
FfxFloat32x2 reprojection_uv;
FfxFloat16x3 reprojection;
FFX_DNSR_Reflections_PickReprojection(/*in*/ dispatch_thread_id,
/* in */ group_thread_id,
/* in */ screen_size,
/* in */ roughness,
/* in */ ray_length,
/* out */ disocclusion_factor,
/* out */ reprojection_uv,
/* out */ reprojection);
if ( (reprojection_uv.x > 0.0f) && (reprojection_uv.y > 0.0f) && (reprojection_uv.x < 1.0f) && (reprojection_uv.y< 1.0f)) {
FfxFloat16 prev_variance = FFX_DNSR_Reflections_SampleVarianceHistory(reprojection_uv);
num_samples = FFX_DNSR_Reflections_SampleNumSamplesHistory(reprojection_uv) * disocclusion_factor;
FfxFloat16 s_max_samples = FfxFloat16(max(8.0f, FfxFloat16(max_samples) * FFX_DNSR_REFLECTIONS_SAMPLES_FOR_ROUGHNESS(roughness)));
num_samples = min(s_max_samples, num_samples + FfxFloat16(1.0f));
FfxFloat16 new_variance = FFX_DNSR_Reflections_ComputeTemporalVariance(radiance.xyz, reprojection.xyz);
if (disocclusion_factor < FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD) {
FFX_DNSR_Reflections_StoreRadianceReprojected(dispatch_thread_id, FfxFloat16x3(0.0f, 0.0f, 0.0f));
FFX_DNSR_Reflections_StoreVariance(dispatch_thread_id, FfxFloat16(1.0f));
FFX_DNSR_Reflections_StoreNumSamples(dispatch_thread_id, FfxFloat16(1.0f));
} else {
FfxFloat16 variance_mix = ffxLerp(new_variance, prev_variance, FfxFloat16(1.0f / num_samples));
FFX_DNSR_Reflections_StoreRadianceReprojected(dispatch_thread_id, reprojection);
FFX_DNSR_Reflections_StoreVariance(dispatch_thread_id, variance_mix);
FFX_DNSR_Reflections_StoreNumSamples(dispatch_thread_id, num_samples);
// Mix in reprojection for radiance mip computation
radiance = ffxLerp(radiance, reprojection, FfxFloat16(0.3f));
}
} else {
FFX_DNSR_Reflections_StoreRadianceReprojected(dispatch_thread_id, FfxFloat16x3(0.0f, 0.0f, 0.0f));
FFX_DNSR_Reflections_StoreVariance(dispatch_thread_id, FfxFloat16(1.0f));
FFX_DNSR_Reflections_StoreNumSamples(dispatch_thread_id, FfxFloat16(1.0f));
}
}
// Downsample 8x8 -> 1 radiance using FFX_GROUPSHARED memory
// Initialize FFX_GROUPSHARED array for downsampling
FfxFloat16 weight = FFX_DNSR_Reflections_GetLuminanceWeight(radiance.xyz);
radiance.xyz *= weight;
if ( (dispatch_thread_id.x >= screen_size.x) || (dispatch_thread_id.y >= screen_size.y) || any(isinf(radiance)) || any(isnan(radiance)) || (weight > FfxFloat16(1.0e3))) {
radiance = FfxFloat16x3(0.0f, 0.0f, 0.0f);
weight = FfxFloat16(0.0f);
}
group_thread_id -= 4; // Center threads in FFX_GROUPSHARED memory
FFX_DNSR_Reflections_StoreInGroupSharedMemory(group_thread_id, FfxFloat16x4(radiance.xyz, weight));
FFX_GROUP_MEMORY_BARRIER;
for (FfxInt32 i = 2; i <= 8; i = i * 2) {
FfxInt32 ox = group_thread_id.x * i;
FfxInt32 oy = group_thread_id.y * i;
FfxInt32 ix = group_thread_id.x * i + i / 2;
FfxInt32 iy = group_thread_id.y * i + i / 2;
if (ix < 8 && iy < 8) {
FfxFloat16x4 rad_weight00 = FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2(ox, oy));
FfxFloat16x4 rad_weight10 = FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2(ox, iy));
FfxFloat16x4 rad_weight01 = FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2(ix, oy));
FfxFloat16x4 rad_weight11 = FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2(ix, iy));
FfxFloat16x4 sum = rad_weight00 + rad_weight01 + rad_weight10 + rad_weight11;
FFX_DNSR_Reflections_StoreInGroupSharedMemory(FfxInt32x2(ox, oy), sum);
}
FFX_GROUP_MEMORY_BARRIER;
}
if ((group_thread_id.x == 0) && (group_thread_id.y == 0)) {
FfxFloat16x4 sum = FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2(0, 0));
FfxFloat16 weight_acc = max(sum.w, FfxFloat16(1.0e-3));
FfxFloat32x3 radiance_avg = sum.xyz / weight_acc;
FFX_DNSR_Reflections_StoreAverageRadiance(dispatch_thread_id.xy / 8, FfxFloat16x3(radiance_avg));
}
}
#else // #if FFX_HALF
struct FFX_DNSR_Reflections_NeighborhoodSample {
FfxFloat32x3 radiance;
};
FFX_DNSR_Reflections_NeighborhoodSample FFX_DNSR_Reflections_LoadFromGroupSharedMemory(FfxInt32x2 idx) {
FfxUInt32x2 packed_radiance = FfxUInt32x2(g_ffx_dnsr_shared_0[idx.y][idx.x], g_ffx_dnsr_shared_1[idx.y][idx.x]);
FfxFloat32x4 unpacked_radiance = FFX_DNSR_Reflections_UnpackFloat16_4(packed_radiance);
FFX_DNSR_Reflections_NeighborhoodSample neighborSample;
neighborSample.radiance = unpacked_radiance.xyz;
return neighborSample;
}
void FFX_DNSR_Reflections_StoreInGroupSharedMemory(FfxInt32x2 group_thread_id, FfxFloat32x3 radiance) {
g_ffx_dnsr_shared_0[group_thread_id.y][group_thread_id.x] = FFX_DNSR_Reflections_PackFloat16(radiance.xy);
g_ffx_dnsr_shared_1[group_thread_id.y][group_thread_id.x] = FFX_DNSR_Reflections_PackFloat16(radiance.zz);
}
void FFX_DNSR_Reflections_StoreInGroupSharedMemory(FfxInt32x2 group_thread_id, FfxFloat32x4 radiance_variance) {
g_ffx_dnsr_shared_0[group_thread_id.y][group_thread_id.x] = FFX_DNSR_Reflections_PackFloat16(radiance_variance.xy);
g_ffx_dnsr_shared_1[group_thread_id.y][group_thread_id.x] = FFX_DNSR_Reflections_PackFloat16(radiance_variance.zw);
}
void FFX_DNSR_Reflections_InitializeGroupSharedMemory(FfxInt32x2 dispatch_thread_id, FfxInt32x2 group_thread_id, FfxInt32x2 screen_size) {
// Load 16x16 region into shared memory using 4 8x8 blocks.
FfxInt32x2 offset[4] = {FfxInt32x2(0, 0), FfxInt32x2(8, 0), FfxInt32x2(0, 8), FfxInt32x2(8, 8)};
// Intermediate storage registers to cache the result of all loads
FfxFloat32x3 radiance[4];
// Start in the upper left corner of the 16x16 region.
dispatch_thread_id -= 4;
// First store all loads in registers
for (FfxInt32 i = 0; i < 4; ++i) {
radiance[i] = FFX_DNSR_Reflections_LoadRadiance(dispatch_thread_id + offset[i]);
}
// Then move all registers to FFX_GROUPSHARED memory
for (FfxInt32 j = 0; j < 4; ++j) {
FFX_DNSR_Reflections_StoreInGroupSharedMemory(group_thread_id + offset[j], radiance[j]); // X
}
}
FfxFloat32x4 FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2 idx) {
FfxUInt32x2 packed_radiance = FfxUInt32x2(g_ffx_dnsr_shared_0[idx.y][idx.x], g_ffx_dnsr_shared_1[idx.y][idx.x]);
return FFX_DNSR_Reflections_UnpackFloat16_4(packed_radiance);
}
FfxFloat32 FFX_DNSR_Reflections_GetLuminanceWeight(FfxFloat32x3 val) {
FfxFloat32 luma = FFX_DNSR_Reflections_Luminance(val.xyz);
FfxFloat32 weight = FfxFloat32(max(exp(-luma * FFX_DNSR_REFLECTIONS_AVG_RADIANCE_LUMINANCE_WEIGHT), 1.0e-2));
return weight;
}
FfxFloat32x2 FFX_DNSR_Reflections_GetSurfaceReprojection(FfxInt32x2 dispatch_thread_id, FfxFloat32x2 uv, FfxFloat32x2 motion_vector) {
// Reflector position reprojection
FfxFloat32x2 history_uv = uv + motion_vector;
return history_uv;
}
FfxFloat32x2 FFX_DNSR_Reflections_GetHitPositionReprojection(FfxInt32x2 dispatch_thread_id, FfxFloat32x2 uv, FfxFloat32 reflected_ray_length) {
FfxFloat32 z = FFX_DNSR_Reflections_LoadDepth(dispatch_thread_id);
FfxFloat32x3 view_space_ray = FFX_DNSR_Reflections_ScreenSpaceToViewSpace(FfxFloat32x3(uv, z));
// We start out with reconstructing the ray length in view space.
// This includes the portion from the camera to the reflecting surface as well as the portion from the surface to the hit position.
FfxFloat32 surface_depth = length(view_space_ray);
FfxFloat32 ray_length = surface_depth + reflected_ray_length;
// We then perform a parallax correction by shooting a ray
// of the same length "straight through" the reflecting surface
// and reprojecting the tip of that ray to the previous frame.
view_space_ray /= surface_depth; // == normalize(view_space_ray)
view_space_ray *= ray_length;
FfxFloat32x3 world_hit_position =
FFX_DNSR_Reflections_ViewSpaceToWorldSpace(FfxFloat32x4(view_space_ray, 1)); // This is the "fake" hit position if we would follow the ray straight through the surface.
FfxFloat32x3 prev_hit_position = FFX_DNSR_Reflections_WorldSpaceToScreenSpacePrevious(world_hit_position);
FfxFloat32x2 history_uv = prev_hit_position.xy;
return history_uv;
}
FfxFloat32 FFX_DNSR_Reflections_GetDisocclusionFactor(FfxFloat32x3 normal, FfxFloat32x3 history_normal, FfxFloat32 linear_depth, FfxFloat32 history_linear_depth) {
FfxFloat32 factor = FfxFloat32(1.0f //
* exp(-abs(1.0f - max(FfxFloat32(0.0f), dot(normal, history_normal))) * FFX_DNSR_REFLECTIONS_DISOCCLUSION_NORMAL_WEIGHT) //
* exp(-abs(history_linear_depth - linear_depth) / linear_depth * FFX_DNSR_REFLECTIONS_DISOCCLUSION_DEPTH_WEIGHT));
return factor;
}
struct FFX_DNSR_Reflections_Moments {
FfxFloat32x3 mean;
FfxFloat32x3 variance;
};
FFX_DNSR_Reflections_Moments FFX_DNSR_Reflections_EstimateLocalNeighborhoodInGroup(FfxInt32x2 group_thread_id) {
FFX_DNSR_Reflections_Moments estimate;
estimate.mean = FfxFloat32x3(0.0f, 0.0f, 0.0f);
estimate.variance = FfxFloat32x3(0.0f, 0.0f, 0.0f);
FfxFloat32 accumulated_weight = FfxFloat32(0.0f);
for (FfxInt32 j = -FFX_DNSR_REFLECTIONS_LOCAL_NEIGHBORHOOD_RADIUS; j <= FFX_DNSR_REFLECTIONS_LOCAL_NEIGHBORHOOD_RADIUS; ++j) {
for (FfxInt32 i = -FFX_DNSR_REFLECTIONS_LOCAL_NEIGHBORHOOD_RADIUS; i <= FFX_DNSR_REFLECTIONS_LOCAL_NEIGHBORHOOD_RADIUS; ++i) {
FfxInt32x2 new_idx = group_thread_id + FfxInt32x2(i, j);
FfxFloat32x3 radiance = FFX_DNSR_Reflections_LoadFromGroupSharedMemory(new_idx).radiance;
FfxFloat32 weight = FFX_DNSR_Reflections_LocalNeighborhoodKernelWeight(FfxFloat32(i)) * FFX_DNSR_Reflections_LocalNeighborhoodKernelWeight(FfxFloat32(j));
accumulated_weight += weight;
estimate.mean += radiance * weight;
estimate.variance += radiance * radiance * weight;
}
}
estimate.mean /= accumulated_weight;
estimate.variance /= accumulated_weight;
estimate.variance = abs(estimate.variance - estimate.mean * estimate.mean);
return estimate;
}
FfxFloat32 dot2(FfxFloat32x3 a) { return dot(a, a); }
void FFX_DNSR_Reflections_PickReprojection(FfxInt32x2 dispatch_thread_id, //
FfxInt32x2 group_thread_id, //
FfxUInt32x2 screen_size, //
FfxFloat32 roughness, //
FfxFloat32 ray_length, //
out FfxFloat32 disocclusion_factor, //
out FfxFloat32x2 reprojection_uv, //
out FfxFloat32x3 reprojection) {
FFX_DNSR_Reflections_Moments local_neighborhood = FFX_DNSR_Reflections_EstimateLocalNeighborhoodInGroup(group_thread_id);
FfxFloat32x2 uv = FfxFloat32x2(dispatch_thread_id.x + 0.5, dispatch_thread_id.y + 0.5) / screen_size;
FfxFloat32x3 normal = FFX_DNSR_Reflections_LoadWorldSpaceNormal(dispatch_thread_id);
FfxFloat32x3 history_normal;
FfxFloat32 history_linear_depth;
{
const FfxFloat32x2 motion_vector = FFX_DNSR_Reflections_LoadMotionVector(dispatch_thread_id);
const FfxFloat32x2 surface_reprojection_uv = FFX_DNSR_Reflections_GetSurfaceReprojection(dispatch_thread_id, uv, motion_vector);
const FfxFloat32x2 hit_reprojection_uv = FFX_DNSR_Reflections_GetHitPositionReprojection(dispatch_thread_id, uv, ray_length);
const FfxFloat32x3 surface_normal = FFX_DNSR_Reflections_SampleWorldSpaceNormalHistory(surface_reprojection_uv);
const FfxFloat32x3 hit_normal = FFX_DNSR_Reflections_SampleWorldSpaceNormalHistory(hit_reprojection_uv);
const FfxFloat32x3 surface_history = FFX_DNSR_Reflections_SampleRadianceHistory(surface_reprojection_uv);
const FfxFloat32x3 hit_history = FFX_DNSR_Reflections_SampleRadianceHistory(hit_reprojection_uv);
const FfxFloat32 hit_normal_similarity = dot(normalize(FfxFloat32x3(hit_normal)), normalize(FfxFloat32x3(normal)));
const FfxFloat32 surface_normal_similarity = dot(normalize(FfxFloat32x3(surface_normal)), normalize(FfxFloat32x3(normal)));
const FfxFloat32 hit_roughness = FFX_DNSR_Reflections_SampleRoughnessHistory(hit_reprojection_uv);
const FfxFloat32 surface_roughness = FFX_DNSR_Reflections_SampleRoughnessHistory(surface_reprojection_uv);
// Choose reprojection uv based on similarity to the local neighborhood.
if (hit_normal_similarity > FFX_DNSR_REFLECTIONS_REPROJECTION_NORMAL_SIMILARITY_THRESHOLD // Candidate for mirror reflection parallax
&& hit_normal_similarity + 1.0e-3 > surface_normal_similarity //
&& abs(hit_roughness - roughness) < abs(surface_roughness - roughness) + 1.0e-3 //
) {
history_normal = hit_normal;
FfxFloat32 hit_history_depth = FFX_DNSR_Reflections_SampleDepthHistory(hit_reprojection_uv);
FfxFloat32 hit_history_linear_depth = FFX_DNSR_Reflections_GetLinearDepth(hit_reprojection_uv, hit_history_depth);
history_linear_depth = hit_history_linear_depth;
reprojection_uv = hit_reprojection_uv;
reprojection = hit_history;
} else {
// Reject surface reprojection based on simple distance
if (dot2(surface_history - local_neighborhood.mean) <
FFX_DNSR_REFLECTIONS_REPROJECT_SURFACE_DISCARD_VARIANCE_WEIGHT * length(local_neighborhood.variance)) {
history_normal = surface_normal;
FfxFloat32 surface_history_depth = FFX_DNSR_Reflections_SampleDepthHistory(surface_reprojection_uv);
FfxFloat32 surface_history_linear_depth = FFX_DNSR_Reflections_GetLinearDepth(surface_reprojection_uv, surface_history_depth);
history_linear_depth = surface_history_linear_depth;
reprojection_uv = surface_reprojection_uv;
reprojection = surface_history;
} else {
disocclusion_factor = FfxFloat32(0.0f);
return;
}
}
}
FfxFloat32 depth = FFX_DNSR_Reflections_LoadDepth(dispatch_thread_id);
FfxFloat32 linear_depth = FFX_DNSR_Reflections_GetLinearDepth(uv, depth);
// Determine disocclusion factor based on history
disocclusion_factor = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, history_normal, linear_depth, history_linear_depth);
if (disocclusion_factor > FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD) // Early out, good enough
return;
// Try to find the closest sample in the vicinity if we are not convinced of a disocclusion
if (disocclusion_factor < FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD) {
FfxFloat32x2 closest_uv = reprojection_uv;
FfxFloat32x2 dudv = 1.0 / FfxFloat32x2(screen_size);
const FfxInt32 search_radius = 1;
for (FfxInt32 y = -search_radius; y <= search_radius; y++) {
for (FfxInt32 x = -search_radius; x <= search_radius; x++) {
FfxFloat32x2 uv = reprojection_uv + FfxFloat32x2(x, y) * dudv;
FfxFloat32x3 history_normal = FFX_DNSR_Reflections_SampleWorldSpaceNormalHistory(uv);
FfxFloat32 history_depth = FFX_DNSR_Reflections_SampleDepthHistory(uv);
FfxFloat32 history_linear_depth = FFX_DNSR_Reflections_GetLinearDepth(uv, history_depth);
FfxFloat32 weight = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, history_normal, linear_depth, history_linear_depth);
if (weight > disocclusion_factor) {
disocclusion_factor = weight;
closest_uv = uv;
reprojection_uv = closest_uv;
}
}
}
reprojection = FFX_DNSR_Reflections_SampleRadianceHistory(reprojection_uv);
}
// Rare slow path - triggered only on the edges.
// Try to get rid of potential leaks at bilinear interpolation level.
if (disocclusion_factor < FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD) {
// If we've got a discarded history, try to construct a better sample out of 2x2 interpolation neighborhood
// Helps quite a bit on the edges in movement
FfxFloat32 uvx = ffxFract(FfxFloat32(screen_size.x) * reprojection_uv.x + 0.5);
FfxFloat32 uvy = ffxFract(FfxFloat32(screen_size.y) * reprojection_uv.y + 0.5);
FfxInt32x2 reproject_texel_coords = FfxInt32x2(screen_size * reprojection_uv - 0.5);
FfxFloat32x3 reprojection00 = FFX_DNSR_Reflections_LoadRadianceHistory(reproject_texel_coords + FfxInt32x2(0, 0));
FfxFloat32x3 reprojection10 = FFX_DNSR_Reflections_LoadRadianceHistory(reproject_texel_coords + FfxInt32x2(1, 0));
FfxFloat32x3 reprojection01 = FFX_DNSR_Reflections_LoadRadianceHistory(reproject_texel_coords + FfxInt32x2(0, 1));
FfxFloat32x3 reprojection11 = FFX_DNSR_Reflections_LoadRadianceHistory(reproject_texel_coords + FfxInt32x2(1, 1));
FfxFloat32x3 normal00 = FFX_DNSR_Reflections_LoadWorldSpaceNormalHistory(reproject_texel_coords + FfxInt32x2(0, 0));
FfxFloat32x3 normal10 = FFX_DNSR_Reflections_LoadWorldSpaceNormalHistory(reproject_texel_coords + FfxInt32x2(1, 0));
FfxFloat32x3 normal01 = FFX_DNSR_Reflections_LoadWorldSpaceNormalHistory(reproject_texel_coords + FfxInt32x2(0, 1));
FfxFloat32x3 normal11 = FFX_DNSR_Reflections_LoadWorldSpaceNormalHistory(reproject_texel_coords + FfxInt32x2(1, 1));
FfxFloat32 depth00 = FFX_DNSR_Reflections_GetLinearDepth(reprojection_uv, FFX_DNSR_Reflections_LoadDepthHistory(reproject_texel_coords + FfxInt32x2(0, 0)));
FfxFloat32 depth10 = FFX_DNSR_Reflections_GetLinearDepth(reprojection_uv, FFX_DNSR_Reflections_LoadDepthHistory(reproject_texel_coords + FfxInt32x2(1, 0)));
FfxFloat32 depth01 = FFX_DNSR_Reflections_GetLinearDepth(reprojection_uv, FFX_DNSR_Reflections_LoadDepthHistory(reproject_texel_coords + FfxInt32x2(0, 1)));
FfxFloat32 depth11 = FFX_DNSR_Reflections_GetLinearDepth(reprojection_uv, FFX_DNSR_Reflections_LoadDepthHistory(reproject_texel_coords + FfxInt32x2(1, 1)));
FfxFloat32x4 w = FfxFloat32x4(1.0f, 1.0f, 1.0f, 1.0f);
// Initialize with occlusion weights
w.x = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, normal00, linear_depth, depth00) > FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD / 2.0f ? FfxFloat32(1.0f) : FfxFloat32(0.0f);
w.y = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, normal10, linear_depth, depth10) > FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD / 2.0f ? FfxFloat32(1.0f) : FfxFloat32(0.0f);
w.z = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, normal01, linear_depth, depth01) > FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD / 2.0f ? FfxFloat32(1.0f) : FfxFloat32(0.0f);
w.w = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, normal11, linear_depth, depth11) > FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD / 2.0f ? FfxFloat32(1.0f) : FfxFloat32(0.0f);
// And then mix in bilinear weights
w.x = FfxFloat32(w.x * (1.0 - uvx) * (1.0 - uvy));
w.y = FfxFloat32(w.y * (uvx) * (1.0 - uvy));
w.z = FfxFloat32(w.z * (1.0 - uvx) * (uvy));
w.w = FfxFloat32(w.w * (uvx) * (uvy));
FfxFloat32 ws = max(w.x + w.y + w.z + w.w, FfxFloat32(1.0e-3));
// normalize
w /= ws;
FfxFloat32x3 history_normal;
FfxFloat32 history_linear_depth;
reprojection = reprojection00 * w.x + reprojection10 * w.y + reprojection01 * w.z + reprojection11 * w.w;
history_linear_depth = depth00 * w.x + depth10 * w.y + depth01 * w.z + depth11 * w.w;
history_normal = normal00 * w.x + normal10 * w.y + normal01 * w.z + normal11 * w.w;
disocclusion_factor = FFX_DNSR_Reflections_GetDisocclusionFactor(normal, history_normal, linear_depth, history_linear_depth);
}
disocclusion_factor = disocclusion_factor < FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD ? FfxFloat32(0.0f) : disocclusion_factor;
}
void FFX_DNSR_Reflections_Reproject(FfxInt32x2 dispatch_thread_id, FfxInt32x2 group_thread_id, FfxUInt32x2 screen_size, FfxFloat32 temporal_stability_factor, FfxInt32 max_samples) {
FFX_DNSR_Reflections_InitializeGroupSharedMemory(dispatch_thread_id, group_thread_id, FfxInt32x2(screen_size));
FFX_GROUP_MEMORY_BARRIER;
group_thread_id += 4; // Center threads in FFX_GROUPSHARED memory
FfxFloat32 variance = FfxFloat32(1.0f);
FfxFloat32 num_samples = FfxFloat32(0.0f);
FfxFloat32 roughness = FFX_DNSR_Reflections_LoadRoughness(dispatch_thread_id);
FfxFloat32x3 normal = FFX_DNSR_Reflections_LoadWorldSpaceNormal(dispatch_thread_id);
FfxFloat32x3 radiance = FFX_DNSR_Reflections_LoadRadiance(dispatch_thread_id);
const FfxFloat32 ray_length = FFX_DNSR_Reflections_LoadRayLength(dispatch_thread_id);
if (FFX_DNSR_Reflections_IsGlossyReflection(roughness)) {
FfxFloat32 disocclusion_factor;
FfxFloat32x2 reprojection_uv;
FfxFloat32x3 reprojection;
FFX_DNSR_Reflections_PickReprojection(/*in*/ dispatch_thread_id,
/* in */ group_thread_id,
/* in */ screen_size,
/* in */ roughness,
/* in */ ray_length,
/* out */ disocclusion_factor,
/* out */ reprojection_uv,
/* out */ reprojection);
if ( (reprojection_uv.x > 0.0f) && (reprojection_uv.y > 0.0f) && (reprojection_uv.x < 1.0f) && (reprojection_uv.y< 1.0f)) {
FfxFloat32 prev_variance = FFX_DNSR_Reflections_SampleVarianceHistory(reprojection_uv);
num_samples = FFX_DNSR_Reflections_SampleNumSamplesHistory(reprojection_uv) * disocclusion_factor;
FfxFloat32 s_max_samples = FfxFloat32(max(8.0f, max_samples * FFX_DNSR_REFLECTIONS_SAMPLES_FOR_ROUGHNESS(roughness)));
num_samples = min(s_max_samples, num_samples + FfxFloat32(1.0f));
FfxFloat32 new_variance = FFX_DNSR_Reflections_ComputeTemporalVariance(radiance.xyz, reprojection.xyz);
if (disocclusion_factor < FFX_DNSR_REFLECTIONS_DISOCCLUSION_THRESHOLD) {
FFX_DNSR_Reflections_StoreRadianceReprojected(dispatch_thread_id, FfxFloat32x3(0.0f, 0.0f, 0.0f));
FFX_DNSR_Reflections_StoreVariance(dispatch_thread_id, FfxFloat32(1.0f));
FFX_DNSR_Reflections_StoreNumSamples(dispatch_thread_id, FfxFloat32(1.0f));
} else {
FfxFloat32 variance_mix = ffxLerp(new_variance, prev_variance, FfxFloat32(1.0f / num_samples));
FFX_DNSR_Reflections_StoreRadianceReprojected(dispatch_thread_id, reprojection);
FFX_DNSR_Reflections_StoreVariance(dispatch_thread_id, variance_mix);
FFX_DNSR_Reflections_StoreNumSamples(dispatch_thread_id, num_samples);
// Mix in reprojection for radiance mip computation
radiance = ffxLerp(radiance, reprojection, FfxFloat32(0.3f));
}
} else {
FFX_DNSR_Reflections_StoreRadianceReprojected(dispatch_thread_id, FfxFloat32x3(0.0f, 0.0f, 0.0f));
FFX_DNSR_Reflections_StoreVariance(dispatch_thread_id, FfxFloat32(1.0f));
FFX_DNSR_Reflections_StoreNumSamples(dispatch_thread_id, FfxFloat32(1.0f));
}
}
// Downsample 8x8 -> 1 radiance using FFX_GROUPSHARED memory
// Initialize FFX_GROUPSHARED array for downsampling
FfxFloat32 weight = FFX_DNSR_Reflections_GetLuminanceWeight(radiance.xyz);
radiance.xyz *= weight;
if ( (dispatch_thread_id.x >= screen_size.x) || (dispatch_thread_id.y >= screen_size.y) || any(isinf(radiance)) || any(isnan(radiance)) || (weight > FfxFloat32(1.0e3))) {
radiance = FfxFloat32x3(0.0f, 0.0f, 0.0f);
weight = FfxFloat32(0.0f);
}
group_thread_id -= 4; // Center threads in FFX_GROUPSHARED memory
FFX_DNSR_Reflections_StoreInGroupSharedMemory(group_thread_id, FfxFloat32x4(radiance.xyz, weight));
FFX_GROUP_MEMORY_BARRIER;
for (FfxInt32 i = 2; i <= 8; i = i * 2) {
FfxInt32 ox = group_thread_id.x * i;
FfxInt32 oy = group_thread_id.y * i;
FfxInt32 ix = group_thread_id.x * i + i / 2;
FfxInt32 iy = group_thread_id.y * i + i / 2;
if (ix < 8 && iy < 8) {
FfxFloat32x4 rad_weight00 = FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2(ox, oy));
FfxFloat32x4 rad_weight10 = FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2(ox, iy));
FfxFloat32x4 rad_weight01 = FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2(ix, oy));
FfxFloat32x4 rad_weight11 = FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2(ix, iy));
FfxFloat32x4 sum = rad_weight00 + rad_weight01 + rad_weight10 + rad_weight11;
FFX_DNSR_Reflections_StoreInGroupSharedMemory(FfxInt32x2(ox, oy), sum);
}
FFX_GROUP_MEMORY_BARRIER;
}
if ((group_thread_id.x == 0) && (group_thread_id.y == 0)) {
FfxFloat32x4 sum = FFX_DNSR_Reflections_LoadFromGroupSharedMemoryRaw(FfxInt32x2(0, 0));
FfxFloat32 weight_acc = max(sum.w, FfxFloat32(1.0e-3));
FfxFloat32x3 radiance_avg = sum.xyz / weight_acc;
FFX_DNSR_Reflections_StoreAverageRadiance(dispatch_thread_id.xy / 8, FfxFloat32x3(radiance_avg));
}
}
#endif // #if FFX_HALF
void Reproject(FfxUInt32 group_index, FfxUInt32 group_id, FfxUInt32x2 group_thread_id) {
FfxUInt32 packed_coords = GetDenoiserTile(group_id);
FfxInt32x2 dispatch_thread_id = FfxInt32x2(packed_coords & 0xffffu, (packed_coords >> 16) & 0xffffu) + FfxInt32x2(group_thread_id);
FfxInt32x2 dispatch_group_id = dispatch_thread_id / 8;
FfxInt32x2 remapped_group_thread_id = FfxInt32x2(ffxRemapForWaveReduction(group_index));
FfxInt32x2 remapped_dispatch_thread_id = dispatch_group_id * 8 + remapped_group_thread_id;
FFX_DNSR_Reflections_Reproject(remapped_dispatch_thread_id, remapped_group_thread_id, RenderSize(), TemporalStabilityFactor(), 32);
}
#endif // FFX_DNSR_REFLECTIONS_REPROJECT