// engine/dep/include/FidelityFX/gpu/denoiser/ffx_denoiser_shadows_tileclassification.h

// This file is part of the FidelityFX SDK.
//
// Copyright (C) 2024 Advanced Micro Devices, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef FFX_DNSR_SHADOWS_TILECLASSIFICATION_HLSL
#define FFX_DNSR_SHADOWS_TILECLASSIFICATION_HLSL

#include "ffx_denoiser_shadows_util.h"

// Group-shared scratch flag used by the fallback path below: it ends up non-zero
// when at least one thread of the group passed a false value.
FFX_GROUPSHARED FfxInt32 g_FFX_DNSR_Shadows_false_count;

// Returns true only when `val` is true on every thread of the thread group.
//
// val: the per-thread predicate to combine.
//
// When the wave is as wide as the (64 thread) group a single wave vote answers the
// question. Otherwise the vote is emulated through group-shared memory.
// NOTE(review): the fallback relies on FFX_GROUP_MEMORY_BARRIER also synchronizing
// execution across the whole thread group, and on every thread of the group calling
// this function - confirm against the macro definition for each backend.
FfxBoolean FFX_DNSR_Shadows_ThreadGroupAllTrue(FfxBoolean val)
{
    const FfxUInt32 lane_count_in_thread_group = 64;

    FfxBoolean every_thread_true = FFX_FALSE;
    if (ffxWaveLaneCount() != lane_count_in_thread_group)
    {
        // Wait for readers of a previous vote before the flag is reset.
        FFX_GROUP_MEMORY_BARRIER;
        g_FFX_DNSR_Shadows_false_count = 0;

        // The reset has to be finished before any thread raises the flag.
        FFX_GROUP_MEMORY_BARRIER;
        if (!val)
        {
            g_FFX_DNSR_Shadows_false_count = 1;
        }

        // All votes have to be in before the flag is read back.
        FFX_GROUP_MEMORY_BARRIER;
        every_thread_true = (g_FFX_DNSR_Shadows_false_count == 0);
    }
    else
    {
        every_thread_true = ffxWaveAllTrue(val);
    }

    return every_thread_true;
}

// Inspects the ray traced shadow masks surrounding the 8x8 pixel tile `gid` and reports
// whether that whole region is uniformly lit or uniformly shadowed.
//
// gid:           coordinate of the 8x8 pixel tile (thread group) being classified.
// all_in_light:  set to true when every loaded mask has all 32 bits set.
// all_in_shadow: set to true when every loaded mask has no bit set.
void FFX_DNSR_Shadows_SearchSpatialRegion(FfxUInt32x2 gid, out FfxBoolean all_in_light, out FfxBoolean all_in_shadow)
{
    // The spatial passes can reach a total region of 1+2+4 = 7x7 around each block.
    // The masks are 8x4, so we need a larger vertical stride.
    //
    // Visualization - each x represents a 4x4 block, xx is one entire 8x4 mask as read
    // from the raytracer result. Same for yy, these are the ones we are working on right now.
    //
    // xx xx xx
    // xx xx xx
    // xx yy xx <-- yy here is the base_tile below
    // xx yy xx
    // xx xx xx
    // xx xx xx

    // All of this should result in scalar ops
    const FfxUInt32x2 base_tile = FFX_DNSR_Shadows_GetTileIndexFromPixelPosition(gid * FfxInt32x2(8, 8));

    // Largest valid mask coordinate; neighbors outside the buffer are clamped onto the border masks.
    const FfxInt32x2 last_tile = FfxInt32x2(FFX_DNSR_Shadows_RoundedDivide(BufferDimensions().x, 8), FFX_DNSR_Shadows_RoundedDivide(BufferDimensions().y, 4)) - 1;

    // Fold the whole 3 x 6 mask region into one OR and one AND accumulator.
    FfxUInt32 any_bits = 0;
    FfxUInt32 common_bits = 0xFFFFFFFF;
    for (FfxInt32 row = -2; row <= 3; ++row)
    {
        for (FfxInt32 col = -1; col <= 1; ++col)
        {
            const FfxInt32x2 neighbor = clamp(FfxInt32x2(base_tile) + FfxInt32x2(col, row), FfxInt32x2(0,0), last_tile);
            const FfxUInt32 neighbor_mask = LoadRaytracedShadowMask(FFX_DNSR_Shadows_LinearTileIndex(neighbor, BufferDimensions().x));

            any_bits |= neighbor_mask;
            common_bits &= neighbor_mask;
        }
    }

    all_in_light = common_bits == 0xFFFFFFFFu;
    all_in_shadow = any_bits == 0u;
}

// Converts a depth value at pixel `did` into an absolute linear depth by unprojecting
// it with ProjectionInverse().
//
// did:   pixel coordinate the depth belongs to.
// depth: depth value, used as the z component of the unprojected position.
// Returns |z / w| of the unprojected position.
FfxFloat32 FFX_DNSR_Shadows_GetLinearDepth(FfxUInt32x2 did, FfxFloat32 depth)
{
    // Pixel center in [0, 1] texture space.
    const FfxFloat32x2 uv = (did + 0.5f) * InvBufferDimensions();

    // Remap to [-1, 1]; the vertical axis is flipped on the way.
    const FfxFloat32x2 ndc = FfxFloat32x2(2.0f * uv.x - 1.0f, 2.0f * (1.0f - uv.y) - 1.0f);

    const FfxFloat32x4 unprojected = FFX_MATRIX_MULTIPLY(ProjectionInverse(), FfxFloat32x4(ndc, depth, 1));
    const FfxFloat32 signed_depth = unprojected.z / unprojected.w; // perspective divide

    return abs(signed_depth);
}

// Decides whether the history of pixel `did` must be discarded.
//
// did:      pixel coordinate in the current frame.
// depth:    depth of the pixel in the current frame.
// velocity: offset added to the pixel's uv to reach its position in the previous frame.
//
// Returns true when the reprojected position is not strictly inside the [0, 1] uv range,
// or when the relative difference between the reprojected linear depth and the previous
// frame's linear depth reaches a tolerance between 1e-2 and 1e-1.
FfxBoolean FFX_DNSR_Shadows_IsDisoccluded(FfxUInt32x2 did, FfxFloat32 depth, FfxFloat32x2 velocity)
{
    const FfxInt32x2 dims = BufferDimensions();
    const FfxFloat32x2 texel_size = InvBufferDimensions();
    const FfxFloat32x2 uv = (did + 0.5f) * texel_size;
    const FfxFloat32x2 ndc = (2.0f * uv - 1.0f) * FfxFloat32x2(1.0f, -1.0f);
    const FfxFloat32x2 previous_uv = uv + velocity;

    // Without an on-screen history sample there is nothing to compare against.
    const FfxBoolean history_on_screen = all(FFX_GREATER_THAN(previous_uv, FfxFloat32x2(0,0))) && all(FFX_LESS_THAN(previous_uv, FfxFloat32x2(1,1)));
    if (!history_on_screen)
    {
        return FFX_TRUE;
    }

    // Read the center values
    const FfxFloat32x3 normal = LoadNormals(did);

    // Depth of this pixel after applying the reprojection matrix.
    const FfxFloat32x4 clip_space = FFX_MATRIX_MULTIPLY(ReprojectionMatrix(), FfxFloat32x4(ndc, depth, 1.0f));
    const FfxFloat32 reprojected_depth = clip_space.z / clip_space.w; // perspective divide

    // How aligned with the view vector? (the more Z aligned, the higher the depth errors)
    const FfxFloat32x4 homogeneous = FFX_MATRIX_MULTIPLY(ViewProjectionInverse(), FfxFloat32x4(ndc, depth, 1.0f));
    const FfxFloat32x3 world_position = FfxFloat32x3(homogeneous.xyz / homogeneous.w); // perspective divide
    const FfxFloat32x3 view_direction = normalize(Eye().xyz - world_position);
    const FfxFloat32 z_alignment = pow(1.0f - dot(view_direction, normal), 8);

    // Relative difference between the reprojected and the stored previous linear depth
    const FfxFloat32 linear_depth = FFX_DNSR_Shadows_GetLinearDepth(did, reprojected_depth);
    const FfxInt32x2 previous_pixel = FfxInt32x2(previous_uv * FfxFloat32x2(dims));
    const FfxFloat32 previous_depth = FFX_DNSR_Shadows_GetLinearDepth(previous_pixel, LoadPreviousDepth(previous_pixel));
    const FfxFloat32 depth_difference = abs(previous_depth - linear_depth) / linear_depth;

    // Resolve into the disocclusion mask
    const FfxFloat32 depth_tolerance = ffxLerp(1e-2f, 1e-1f, z_alignment);
    return depth_difference >= depth_tolerance;
}

// Returns the velocity of the closest pixel found among this lane and the lanes it
// exchanges data with through ffxQuadReadX / ffxQuadReadY.
//
// did:   pixel coordinate whose own velocity is loaded as the starting candidate.
// depth: depth of that pixel; with FFX_DENOISER_OPTION_INVERTED_DEPTH a larger value
//        counts as closer, otherwise a smaller one.
//
// NOTE(review): presumably the two reads cover the horizontal and vertical neighbors
// of a 2x2 quad - confirm against the ffxQuadRead* definitions. Both reads are issued
// unconditionally, before any branch on their results.
FfxFloat32x2 FFX_DNSR_Shadows_GetClosestVelocity(FfxInt32x2 did, FfxFloat32 depth)
{
    FfxFloat32 closest_depth = depth;
    FfxFloat32x2 closest_velocity = LoadVelocity(did);

    // First exchange: compare against the X neighbor.
    const FfxFloat32 x_neighbor_depth = ffxQuadReadX(closest_depth);
    const FfxFloat32x2 x_neighbor_velocity = ffxQuadReadX(closest_velocity);
#if FFX_DENOISER_OPTION_INVERTED_DEPTH
    const FfxBoolean x_neighbor_is_closer = x_neighbor_depth > closest_depth;
#else
    const FfxBoolean x_neighbor_is_closer = x_neighbor_depth < closest_depth;
#endif
    if (x_neighbor_is_closer)
    {
        closest_depth = x_neighbor_depth;
        closest_velocity = x_neighbor_velocity;
    }

    // Second exchange: the Y neighbor already carries the winner of its own first exchange.
    const FfxFloat32 y_neighbor_depth = ffxQuadReadY(closest_depth);
    const FfxFloat32x2 y_neighbor_velocity = ffxQuadReadY(closest_velocity);
#if FFX_DENOISER_OPTION_INVERTED_DEPTH
    const FfxBoolean y_neighbor_is_closer = y_neighbor_depth > closest_depth;
#else
    const FfxBoolean y_neighbor_is_closer = y_neighbor_depth < closest_depth;
#endif
    if (y_neighbor_is_closer)
    {
        closest_depth = y_neighbor_depth;
        closest_velocity = y_neighbor_velocity;
    }

    return closest_velocity;
}

// Radius, in pixels, of the separable neighborhood kernel (2 * KERNEL_RADIUS + 1 taps per axis).
#define KERNEL_RADIUS 8

// Returns the normalized weight of the kernel tap at distance `i` from the center.
//
// i: distance of the tap from the kernel center, in pixels.
//
// The un-normalized weight is exp(-3 * i^2 / (KERNEL_RADIUS + 1)^2). It is divided by the
// sum of the weights of all 2 * KERNEL_RADIUS + 1 taps, so the weights of one full row sum to one.
FfxFloat32 FFX_DNSR_Shadows_KernelWeight(FfxFloat32 i)
{
// The argument is parenthesized so that a compound expression such as KERNEL_WEIGHT(a - b)
// is squared as a whole. The macro stays defined after this function, so it must be safe
// for any argument.
#define KERNEL_WEIGHT(i) (exp(-3.0 * FfxFloat32((i) * (i)) / ((KERNEL_RADIUS + 1.0) * (KERNEL_RADIUS + 1.0))))

    // Statically initialize kernel_weights_sum
    FfxFloat32 kernel_weights_sum = 0;
    kernel_weights_sum += KERNEL_WEIGHT(0);
    for (FfxInt32 c = 1; c <= KERNEL_RADIUS; ++c)
    {
        kernel_weights_sum += 2 * KERNEL_WEIGHT(c); // Add other half of the kernel to the sum
    }
    FfxFloat32 inv_kernel_weights_sum = ffxReciprocal(kernel_weights_sum);

    // The only runtime code in this function
    return KERNEL_WEIGHT(i) * inv_kernel_weights_sum;
}

// Adds one weighted sample to a running moment.
//
// value:   sample coming from the horizontal neighborhood pass. With one sample per
//          pixel it stands for both the mean and the variance.
// weight:  kernel weight of the sample.
// moments: accumulator, increased by value * weight.
void FFX_DNSR_Shadows_AccumulateMoments(FfxFloat32 value, FfxFloat32 weight, inout FfxFloat32 moments)
{
    const FfxFloat32 weighted_sample = value * weight;
    moments = moments + weighted_sample;
}

// The horizontal part of a 17x17 local neighborhood kernel.
//
// did: pixel coordinate of the kernel center. The row may lie outside the buffer, in
//      which case the result is 0.
//
// Returns the kernel-weighted sum of the shadow mask bits of the 17 pixels centered on
// `did` in its row. Pixels to the left of the first or to the right of the last tile of
// a row count as unset bits.
FfxFloat32 FFX_DNSR_Shadows_HorizontalNeighborhood(FfxInt32x2 did)
{
    // Prevent vertical out of bounds access
    if ((did.y < 0) || (did.y >= BufferDimensions().y)) return 0;

    const FfxUInt32x2 tile_index = FFX_DNSR_Shadows_GetTileIndexFromPixelPosition(did);
    const FfxUInt32 center_tile_index = FFX_DNSR_Shadows_LinearTileIndex(tile_index, BufferDimensions().x);

    const FfxBoolean is_first_tile_in_row = tile_index.x == 0;
    const FfxBoolean is_last_tile_in_row = tile_index.x == (FFX_DNSR_Shadows_RoundedDivide(BufferDimensions().x, 8) - 1);

    // Neighbor masks are only fetched when they belong to the same row of tiles.
    FfxUInt32 left_tile = 0;
    if (!is_first_tile_in_row) left_tile = LoadRaytracedShadowMask(center_tile_index - 1);
    const FfxUInt32 center_tile = LoadRaytracedShadowMask(center_tile_index);
    FfxUInt32 right_tile = 0;
    if (!is_last_tile_in_row) right_tile = LoadRaytracedShadowMask(center_tile_index + 1);

    // Each mask stores four rows of eight bits; pick the eight bits of our row in each tile.
    const FfxUInt32 row_shift = (did.y % 4) * 8;
    const FfxUInt32 left_bits = (left_tile >> row_shift) & 0xFF;
    const FfxUInt32 center_bits = (center_tile >> row_shift) & 0xFF;
    const FfxUInt32 right_bits = (right_tile >> row_shift) & 0xFF;

    // Combine them into a single mask containing [left, center, right] from least to most
    // significant bit, then shift so that our own pixel lands on bit index 8 (the ninth bit).
    // That leaves eight neighbors on either side in the lowest 17 bits.
    const FfxUInt32 column_in_tile = (did.x % 8);
    const FfxUInt32 neighborhood = (left_bits | (center_bits << 8) | (right_bits << 16)) >> column_in_tile;

    // Walk the 17 bits from the leftmost neighbor to the rightmost one. The tap distance,
    // and therefore the weight, is the distance of the bit from bit index 8.
    FfxFloat32 moment = 0.0; // For one sample per pixel this is both, mean and variance
    for (FfxInt32 bit = 0; bit <= 16; ++bit)
    {
        const FfxUInt32 mask = 1u << bit;
        moment += FfxBoolean(mask & neighborhood) ? FFX_DNSR_Shadows_KernelWeight(abs(bit - 8)) : 0;
    }

    return moment;
}

// Group-shared column buffer indexed as [gtid.x][slot]. Slots 0-7 hold the horizontal
// results of the rows 8 pixels above the group's rows, slots 8-15 the group's own rows
// and slots 16-23 the rows 8 pixels below.
FFX_GROUPSHARED FfxFloat32 g_FFX_DNSR_Shadows_neighborhood[8][24];

// Computes the vertically filtered local neighborhood value of pixel `did` from the
// horizontal results shared by the threads of its column.
//
// did:  pixel coordinate.
// gtid: coordinate of the thread inside its group; both components are used as indices
//       into the 8 x 24 shared buffer, so they are expected to be in [0, 7].
//
// Must be called by every thread of the group: each thread fills three slots of the
// shared buffer before the barrier and reads its neighbors' slots afterwards.
FfxFloat32 FFX_DNSR_Shadows_ComputeLocalNeighborhood(FfxInt32x2 did, FfxInt32x2 gtid)
{
    const FfxFloat32 row_above = FFX_DNSR_Shadows_HorizontalNeighborhood(FfxInt32x2(did.x, did.y - 8));
    const FfxFloat32 row_own = FFX_DNSR_Shadows_HorizontalNeighborhood(FfxInt32x2(did.x, did.y));
    const FfxFloat32 row_below = FFX_DNSR_Shadows_HorizontalNeighborhood(FfxInt32x2(did.x, did.y + 8));

    const FfxInt32 column = gtid.x;
    const FfxInt32 own_slot = gtid.y + 8;

    g_FFX_DNSR_Shadows_neighborhood[column][own_slot - 8] = row_above;
    g_FFX_DNSR_Shadows_neighborhood[column][own_slot] = row_own;
    g_FFX_DNSR_Shadows_neighborhood[column][own_slot + 8] = row_below;

    FFX_GROUP_MEMORY_BARRIER;

    // The thread's own three values need no shared memory read: the center tap and the
    // two outermost taps, KERNEL_RADIUS pixels up and KERNEL_RADIUS pixels down.
    FfxFloat32 filtered = 0;
    FFX_DNSR_Shadows_AccumulateMoments(row_own, FFX_DNSR_Shadows_KernelWeight(0), filtered);
    FFX_DNSR_Shadows_AccumulateMoments(row_above, FFX_DNSR_Shadows_KernelWeight(KERNEL_RADIUS), filtered);
    FFX_DNSR_Shadows_AccumulateMoments(row_below, FFX_DNSR_Shadows_KernelWeight(KERNEL_RADIUS), filtered);

    // The remaining taps come from the slots written by the other threads of this column.
    for (FfxInt32 offset = 1; offset < KERNEL_RADIUS; ++offset)
    {
        const FfxFloat32 tap_weight = FFX_DNSR_Shadows_KernelWeight(offset);
        FFX_DNSR_Shadows_AccumulateMoments(g_FFX_DNSR_Shadows_neighborhood[column][own_slot - offset], tap_weight, filtered);
        FFX_DNSR_Shadows_AccumulateMoments(g_FFX_DNSR_Shadows_neighborhood[column][own_slot + offset], tap_weight, filtered);
    }

    return filtered;
}

// Stores the meta data word of tile `gid`. Only the thread with gtid == (0, 0) writes.
//
// gid:          coordinate of the 8x8 pixel tile.
// gtid:         coordinate of the calling thread inside its group.
// is_cleared:   sets TILE_META_DATA_CLEAR_MASK in the stored word.
// all_in_light: sets TILE_META_DATA_LIGHT_MASK in the stored word.
void FFX_DNSR_Shadows_WriteTileMetaData(FfxUInt32x2 gid, FfxUInt32x2 gtid, FfxBoolean is_cleared, FfxBoolean all_in_light)
{
    const FfxBoolean is_first_thread = all(FFX_EQUAL(gtid, FfxUInt32x2(0,0)));
    if (!is_first_thread)
    {
        return;
    }

    const FfxUInt32 light_mask = all_in_light ? TILE_META_DATA_LIGHT_MASK : 0;
    const FfxUInt32 clear_mask = is_cleared ? TILE_META_DATA_CLEAR_MASK : 0;

    // Tiles are stored row by row, one word per 8 pixel wide tile.
    StoreMetadata(gid.y * FFX_DNSR_Shadows_RoundedDivide(BufferDimensions().x, 8) + gid.x, FfxUInt32(light_mask | clear_mask));
}

// Writes the outputs of a pixel that belongs to a skipped tile and flags the tile as cleared.
//
// did:                pixel coordinate to write.
// gtid:               coordinate of the calling thread inside its group.
// gid:                coordinate of the tile.
// shadow_value:       value stored as the mean in both outputs.
// is_shadow_receiver: selects a temporal sample count of 1 (receiver) or 0.
// all_in_light:       forwarded to the tile meta data.
void FFX_DNSR_Shadows_ClearTargets(FfxUInt32x2 did, FfxUInt32x2 gtid, FfxUInt32x2 gid, FfxFloat32 shadow_value, FfxBoolean is_shadow_receiver, FfxBoolean all_in_light)
{
    FFX_DNSR_Shadows_WriteTileMetaData(gid, gtid, FFX_TRUE, all_in_light);

    // mean, variance
    StoreReprojectionResults(did, FfxFloat32x2(shadow_value, 0));

    FfxFloat32 temporal_sample_count = 0;
    if (is_shadow_receiver)
    {
        temporal_sample_count = 1;
    }

    // mean, variance, temporal sample count
    StoreMoments(did, FfxFloat32x3(shadow_value, 0, temporal_sample_count));
}

// Entry point of the tile classification pass, executed once per pixel of an 8x8 tile.
//
// group_index: flat index of the thread inside its group; remapped to a 2D position below.
// gid:         coordinate of the 8x8 pixel tile (thread group).
//
// Tiles without any shadow receiver, and tiles whose surrounding mask region is uniformly
// lit or uniformly shadowed, are cleared and flagged in the tile meta data. For all other
// tiles the function updates the per-pixel moments, derives a variance and stores a
// temporally blended shadow value.
//
// The early returns depend on group-wide votes, so all threads of a group take the same
// path. The quad reads and the group-shared neighborhood computation are issued before the
// per-pixel `is_shadow_receiver` branch.
void FFX_DNSR_Shadows_TileClassification(FfxUInt32 group_index, FfxUInt32x2 gid)
{
    FfxUInt32x2 gtid = ffxRemapForWaveReduction(group_index);  // Make sure we can use the QuadReadAcross intrinsics to access a 2x2 region.
    FfxUInt32x2 did = gid * 8 + gtid;

    FfxBoolean is_shadow_receiver = IsShadowReciever(did);

    // Skip the tile when none of its pixels receives shadows.
    FfxBoolean skip_sky = FFX_DNSR_Shadows_ThreadGroupAllTrue(!is_shadow_receiver);
    if (skip_sky)
    {
        // We have to set all resources of the tile we skipped to sensible values as neighboring active denoiser tiles might want to read them.
        FFX_DNSR_Shadows_ClearTargets(did, gtid, gid, 0, is_shadow_receiver, FFX_FALSE);
        return;
    }

    FfxBoolean all_in_light = FFX_FALSE;
    FfxBoolean all_in_shadow = FFX_FALSE;
    FFX_DNSR_Shadows_SearchSpatialRegion(gid, all_in_light, all_in_shadow);
    FfxFloat32 shadow_value = all_in_light ? 1 : 0; // Either all_in_light or all_in_shadow must be true, otherwise we would not skip the tile.

    FfxBoolean can_skip = all_in_light || all_in_shadow;
    // We have to append the entire tile if there is a single lane that we can't skip
    FfxBoolean skip_tile = FFX_DNSR_Shadows_ThreadGroupAllTrue(can_skip);
    if (skip_tile)
    {
        // We have to set all resources of the tile we skipped to sensible values as neighboring active denoiser tiles might want to read them.
        FFX_DNSR_Shadows_ClearTargets(did, gtid, gid, shadow_value, is_shadow_receiver, all_in_light);
        return;
    }

    // From here on the tile is processed: store meta data with neither the clear nor the light flag.
    FFX_DNSR_Shadows_WriteTileMetaData(gid, gtid, FFX_FALSE, FFX_FALSE);

    FfxFloat32 depth = LoadDepth(FfxInt32x2(did));
    const FfxFloat32x2 velocity = FFX_DNSR_Shadows_GetClosestVelocity(FfxInt32x2(did), depth); // Must happen before we deactivate lanes
    const FfxFloat32 local_neighborhood = FFX_DNSR_Shadows_ComputeLocalNeighborhood(FfxInt32x2(did), FfxInt32x2(gtid));

    // Position of this pixel in the previous frame, as uv and as integer pixel coordinate.
    const FfxFloat32x2 texel_size = InvBufferDimensions();
    const FfxFloat32x2 uv = (did.xy + 0.5f) * texel_size;
    const FfxFloat32x2 history_uv = uv + velocity;
    const FfxInt32x2 history_pos = FfxInt32x2(history_uv * BufferDimensions());

    const FfxUInt32x2 tile_index = FFX_DNSR_Shadows_GetTileIndexFromPixelPosition(FfxInt32x2(did));
    const FfxUInt32 linear_tile_index = FFX_DNSR_Shadows_LinearTileIndex(tile_index, BufferDimensions().x);

    const FfxUInt32 shadow_tile = LoadRaytracedShadowMask(linear_tile_index);

    // Defaults stored for pixels that are not shadow receivers.
    FfxFloat32x3 moments_current = FfxFloat32x3(0,0,0);
    FfxFloat32 variance = 0;
    FfxFloat32 shadow_clamped = 0;
    if (is_shadow_receiver) // do not process sky pixels
    {
        // A set bit in the ray traced mask means the pixel is lit.
        FfxBoolean hit_light = FfxBoolean(shadow_tile & FFX_DNSR_Shadows_GetBitMaskFromPixelPosition(did));
        const FfxFloat32 shadow_current = hit_light ? 1.0 : 0.0;

        // Perform moments and variance calculations
        {
            FfxBoolean is_disoccluded = FFX_DNSR_Shadows_IsDisoccluded(did, depth, velocity);
            const FfxFloat32x3 previous_moments = is_disoccluded ? FfxFloat32x3(0.0f, 0.0f, 0.0f) // Can't trust previous moments on disocclusion
                : LoadPreviousMomentsBuffer(history_pos);

            // Incremental (Welford-style) update: x is the running mean, y the running sum of
            // squared differences and z the number of samples accumulated so far.
            const FfxFloat32 old_m = previous_moments.x;
            const FfxFloat32 old_s = previous_moments.y;
            const FfxFloat32 sample_count = previous_moments.z + 1.0f;
            const FfxFloat32 new_m = old_m + (shadow_current - old_m) / sample_count;
            const FfxFloat32 new_s = old_s + (shadow_current - old_m) * (shadow_current - new_m);

            // With a single sample there is no variance estimate yet; fall back to 1.
            variance = (sample_count > 1.0f ? new_s / (sample_count - 1.0f) : 1.0f);
            moments_current = FfxFloat32x3(new_m, new_s, sample_count);
        }

        // Retrieve local neighborhood and reproject
        {
            // The neighborhood value serves as both first and second moment, see FFX_DNSR_Shadows_AccumulateMoments.
            FfxFloat32 mean = local_neighborhood;
            FfxFloat32 spatial_variance = local_neighborhood;

            spatial_variance = max(spatial_variance - mean * mean, 0.0f);

            // Compute the clamping bounding box
            const FfxFloat32 std_deviation = sqrt(spatial_variance);
            const FfxFloat32 nmin = mean - 0.5f * std_deviation;
            const FfxFloat32 nmax = mean + 0.5f * std_deviation;

            // Clamp reprojected sample to local neighborhood
            // On the first frame the current sample stands in for the history.
            FfxFloat32 shadow_previous = shadow_current;
            if (IsFirstFrame() == 0)
            {
                shadow_previous = LoadHistory(history_uv);
            }

            shadow_clamped = clamp(shadow_previous, nmin, nmax);

            // Reduce history weighting
            // The further the history value lies from the local mean, measured in half
            // standard deviations, the more the accumulated sample count is scaled down.
            const FfxFloat32 sigma = 20.0f;
            const FfxFloat32 temporal_discontinuity = (shadow_previous - mean) / max(0.5f * std_deviation, 0.001f);
            const FfxFloat32 sample_counter_damper = exp(-temporal_discontinuity * temporal_discontinuity / sigma);
            moments_current.z *= sample_counter_damper;

            // Boost variance on first frames
            if (moments_current.z < 16.0f)
            {
                const FfxFloat32 variance_boost = max(16.0f - moments_current.z, 1.0f);
                variance = max(variance, spatial_variance);
                variance *= variance_boost;
            }
        }

        // Perform the temporal blend
        // history_weight is 1 for a sample count of 0 and reaches 0 at a count of 8.
        // NOTE(review): assuming ffxLerp(a, b, t) is a standard linear interpolation, the share
        // of the current sample therefore ranges from 1.0 down to 0.05 - confirm.
        const FfxFloat32 history_weight = sqrt(max(8.0f - moments_current.z, 0.0f) / 8.0f);
        shadow_clamped = ffxLerp(shadow_clamped, shadow_current, ffxLerp(0.05f, 1.0f, history_weight));
    }

    // Output the results of the temporal pass
    StoreReprojectionResults(did.xy, FfxFloat32x2(shadow_clamped, variance));
    StoreMoments(did.xy, moments_current);
}

#endif