938 lines
42 KiB
C
938 lines
42 KiB
C
// This file is part of the FidelityFX SDK.
|
|
//
|
|
// Copyright (C) 2024 Advanced Micro Devices, Inc.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files(the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions :
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
// THE SOFTWARE.
|
|
|
|
#ifndef FFX_BLUR_H
|
|
#define FFX_BLUR_H
|
|
//_____________________________________________________________/\_______________________________________________________________
|
|
//==============================================================================================================================
|
|
//
|
|
// [FFX BLUR] Blur
|
|
//
|
|
//==============================================================================================================================
|
|
//
|
|
/// @defgroup FfxGPUBlur FidelityFX BLUR
|
|
/// FidelityFX Blur GPU documentation
|
|
///
|
|
/// @ingroup FfxGPUEffects
|
|
|
|
//------------------------------------------------------------------------------------------------------------------------------
|
|
//
|
|
// ABOUT
|
|
// =====
|
|
// AMD FidelityFX Blur is a collection of blurring effects implemented on compute shaders, hand-optimized for maximum performance.
|
|
// FFX-Blur includes
|
|
// - Gaussian Blur w/ large kernel support (up to 21x21)
|
|
//
|
|
//==============================================================================================================================
|
|
|
|
//==============================================================================================================================
|
|
// BLUR SETUP
|
|
//==============================================================================================================================
|
|
|
|
|
|
/// FFX_BLUR_TILE_SIZE_Y: Tile Y dimensions that the local threadgroup will work on.
|
|
/// Note: each threadgroup is responsible for blurring a tile of the input image.
|
|
///
|
|
/// @ingroup FfxGPUBlur
|
|
#ifndef FFX_BLUR_TILE_SIZE_Y
|
|
#define FFX_BLUR_TILE_SIZE_Y 8
|
|
#endif
|
|
|
|
/// FFX_BLUR_TILE_SIZE_X: Tile X dimensions that the local threadgroup will work on.
|
|
/// Note: each threadgroup is responsible for blurring a tile of the input image.
|
|
///
|
|
/// @ingroup FfxGPUBlur
|
|
#ifndef FFX_BLUR_TILE_SIZE_X
|
|
#define FFX_BLUR_TILE_SIZE_X 8
|
|
#endif
|
|
|
|
/// FFX_BLUR_DISPATCH_Y: Y dimension of the Blur compute dispatch.
|
|
/// The compute dispatch on the CPU side uses this value when invoking Dispatch.
|
|
///
|
|
/// @ingroup FfxGPUBlur
|
|
#define FFX_BLUR_DISPATCH_Y 8
|
|
|
|
#ifndef FFX_CPU
|
|
|
|
/// FFX_BLUR_OPTION_KERNEL_DIMENSION needs to be defined by the client application
|
|
/// App should define e.g the following for 5x5 blur:
|
|
/// #define FFX_BLUR_OPTION_KERNEL_DIMENSION 5
|
|
///
|
|
/// @ingroup FfxGPUBlur
|
|
#ifndef FFX_BLUR_OPTION_KERNEL_DIMENSION
|
|
#error Please define FFX_BLUR_OPTION_KERNEL_DIMENSION
|
|
#endif
|
|
|
|
/// FFX_BLUR_KERNEL_RANGE is defined relative to FFX_BLUR_OPTION_KERNEL_DIMENSION
|
|
/// See ffx_blur_callbacks_*.h for details.
|
|
///
|
|
/// @ingroup FfxGPUBlur
|
|
#ifndef FFX_BLUR_KERNEL_RANGE
|
|
#error Please define FFX_BLUR_KERNEL_RANGE
|
|
#endif
|
|
|
|
//--------------------------------------------------------------------------------------
|
|
// BLUR CONFIG
|
|
//--------------------------------------------------------------------------------------
|
|
// hardcoded variants
|
|
#define BLUR_DEBUG_PREFILL_OUTPUT_CACHE_WITH_COLOR 0
|
|
#define BLUR_GROUPSHARED_MEMORY_SOA 0 // [Deprecated] improves LDS but too high traffic still
|
|
#define BLUR_GROUPSHARED_MEMORY_HALF 0 // [Deprecated] LOTS of LDS traffic (1x ds_read per channel), need to pack with FfxUInt32
|
|
#define BLUR_GROUPSHARED_MEMORY_PK_UINT 1 // 1x ds_read2st64_b32 for all three channels
|
|
#define BLUR_FP16_KERNEL_LOOPS 1 // use fp16 for kernel loop counters and lds indexing (increases VGPR due to sdwa)
|
|
#define BLUR_FP16_CLAMP 1 // ensure fp16 min/max is used for clamp()
|
|
// cpu-driven variants
|
|
#ifndef BLUR_ENABLE_INPUT_CACHE
|
|
#define BLUR_ENABLE_INPUT_CACHE 0 // currently only slows the algorithm :(
|
|
#endif
|
|
#ifndef BLUR_DISABLE_CLAMP
|
|
#define BLUR_DISABLE_CLAMP 0 // Generates incorrect image at the image borders (no clamp), for testing theoretical speed
|
|
#endif
|
|
#ifndef BLUR_OPTIMIZED_CLAMP
|
|
#define BLUR_OPTIMIZED_CLAMP 0 // [Experimental] Testing a new optimized clamp ISA
|
|
#endif
|
|
|
|
// constants
|
|
#define BLUR_TILE_SIZE_Y_INV (1.0 / FFX_BLUR_TILE_SIZE_Y)
|
|
|
|
//--------------------------------------------------------------------------------------
|
|
// GROUPSHARED MEMORY
|
|
//--------------------------------------------------------------------------------------
|
|
// Define CacheTypes<FP16, SOA>
|
|
#if BLUR_GROUPSHARED_MEMORY_SOA
|
|
#if BLUR_GROUPSHARED_MEMORY_HALF
|
|
#ifdef FFX_HLSL
|
|
#define BLUR_GROUPSHARED_MEMORY_TYPE groupshared FfxFloat16
|
|
#else
|
|
#define BLUR_GROUPSHARED_MEMORY_TYPE shared FfxFloat16
|
|
#endif
|
|
#else
|
|
#ifdef FFX_HLSL
|
|
#define BLUR_GROUPSHARED_MEMORY_TYPE groupshared FfxFloat32
|
|
#else
|
|
#define BLUR_GROUPSHARED_MEMORY_TYPE shared FfxFloat32
|
|
#endif
|
|
#endif
|
|
#else // BLUR_GROUPSHARED_MEMORY_SOA
|
|
#if BLUR_GROUPSHARED_MEMORY_HALF
|
|
#ifdef FFX_HLSL
|
|
#define BLUR_GROUPSHARED_MEMORY_TYPE groupshared FfxFloat16x3
|
|
#else
|
|
#define BLUR_GROUPSHARED_MEMORY_TYPE shared FfxFloat16x3
|
|
#endif
|
|
#else
|
|
#ifdef FFX_HLSL
|
|
#define BLUR_GROUPSHARED_MEMORY_TYPE groupshared FfxFloat32x3
|
|
#else
|
|
#define BLUR_GROUPSHARED_MEMORY_TYPE shared FfxFloat32x3
|
|
#endif
|
|
#endif
|
|
#endif // BLUR_GROUPSHARED_MEMORY_SOA
|
|
|
|
//==============================================================================================================================
|
|
// MATH HELPERS
|
|
//==============================================================================================================================
|
|
#define DIV_AND_ROUND_UP(x, y) (((x) + ((y)-1)) / ((y)))
|
|
// Fast modulo operator for powers of two values for Y: x % y == x & (y-1)
|
|
#define FAST_MOD(x, y) ((x) & (y - 1))
|
|
#if FFX_HALF
|
|
#define FAST_MOD16(x, y) ((x) & (y - FfxInt16(1)))
|
|
#endif
|
|
|
|
// OUTPUT CACHE ########################################
|
|
/*
|
|
# Notes from Jordan's Presentation
|
|
src: https://gpuopen.com/gdc-presentations/2019/gdc-2019-s5-blend-of-gcn-optimization-and-color-processing.pdf
|
|
Use 2^n tiles to use bitwise AND in place of the more ALU-expensive % operator, see #define FAST_MOD above
|
|
MinTiles -> Ceil(HalfKernel / TileSize) * 2 + 1
|
|
*/
|
|
#if FFX_BLUR_OPTION_KERNEL_DIMENSION > 7
|
|
#define NUM_TILES_OUTPUT_CACHE 8
|
|
#else
|
|
#define NUM_TILES_OUTPUT_CACHE 4
|
|
#endif
|
|
|
|
#define NUM_PIXELS_OUTPUT_CACHE (FFX_BLUR_TILE_SIZE_Y * FFX_BLUR_TILE_SIZE_X * NUM_TILES_OUTPUT_CACHE)
|
|
|
|
|
|
#if BLUR_GROUPSHARED_MEMORY_PK_UINT
|
|
#ifdef FFX_HLSL
|
|
groupshared FfxUInt32 OutputCacheRG[NUM_PIXELS_OUTPUT_CACHE]; // RG: 2x fp16's are packed into 32bit unsigned int
|
|
groupshared FfxFloat32 OutputCacheB [NUM_PIXELS_OUTPUT_CACHE]; // B : don't use fp16 for B to avoid bank conflicts
|
|
#else
|
|
shared FfxUInt32 OutputCacheRG[NUM_PIXELS_OUTPUT_CACHE]; // RG: 2x fp16's are packed into 32bit unsigned int
|
|
shared FfxFloat32 OutputCacheB [NUM_PIXELS_OUTPUT_CACHE]; // B : don't use fp16 for B to avoid bank conflicts
|
|
#endif
|
|
#else
|
|
#if BLUR_GROUPSHARED_MEMORY_SOA
|
|
BLUR_GROUPSHARED_MEMORY_TYPE OutputCache[NUM_PIXELS_OUTPUT_CACHE * 3]; // stores rrrrrr...ggggggg...bbbbb...
|
|
#else
|
|
BLUR_GROUPSHARED_MEMORY_TYPE OutputCache[NUM_PIXELS_OUTPUT_CACHE]; // stores rgbrgbrgbrgbrgbrgbrgbrgb...
|
|
#endif // BLUR_GROUPSHARED_MEMORY_SOA
|
|
#endif // BLUR_GROUPSHARED_MEMORY_PK_UINT
|
|
|
|
// Based on the FFX_BLUR_OPTION_KERNEL_DIMENSION, we will need to pre-fill a number of tiles.
|
|
// e.g. TILE_SIZE_Y=8
|
|
// -----------------------------------------
|
|
// kernel = 3 | NUM_PREFILL_TILES_OUTPUT_CACHE = 1 |
|
|
// kernel = 5 | NUM_PREFILL_TILES_OUTPUT_CACHE = 1 |
|
|
// kernel = 7 | NUM_PREFILL_TILES_OUTPUT_CACHE = 1 |
|
|
// kernel = 9* | NUM_PREFILL_TILES_OUTPUT_CACHE = 2*|
|
|
// kernel = 11 | NUM_PREFILL_TILES_OUTPUT_CACHE = 2 |
|
|
// kernel = 13 | NUM_PREFILL_TILES_OUTPUT_CACHE = 2 |
|
|
// kernel = 15 | NUM_PREFILL_TILES_OUTPUT_CACHE = 2 |
|
|
// kernel = 17*| NUM_PREFILL_TILES_OUTPUT_CACHE = 3*|
|
|
// kernel = 19 | NUM_PREFILL_TILES_OUTPUT_CACHE = 3 |
|
|
// kernel = 21 | NUM_PREFILL_TILES_OUTPUT_CACHE = 3 |
|
|
// kernel = 23 | NUM_PREFILL_TILES_OUTPUT_CACHE = 3 |
|
|
// -----------------------------------------
|
|
#define NUM_PREFILL_TILES_OUTPUT_CACHE DIV_AND_ROUND_UP(FFX_BLUR_OPTION_KERNEL_DIMENSION, FFX_BLUR_TILE_SIZE_Y)
|
|
|
|
|
|
// INPUT CACHE ########################################
|
|
#if BLUR_ENABLE_INPUT_CACHE
|
|
#define INPUT_CACHE_TILE_SIZE_X (FFX_BLUR_TILE_SIZE_X + FFX_BLUR_OPTION_KERNEL_DIMENSION - 1)
|
|
#define NUM_TILES_INPUT_CACHE 1
|
|
#define NUM_PIXELS_INPUT_CACHE ((INPUT_CACHE_TILE_SIZE_X * FFX_BLUR_TILE_SIZE_Y) * NUM_TILES_INPUT_CACHE)
|
|
|
|
#if BLUR_GROUPSHARED_MEMORY_PK_UINT
|
|
#ifdef FFX_HLSL
|
|
groupshared FfxUInt32 InputCacheRG[NUM_PIXELS_INPUT_CACHE]; // RG: 2x fp16's are packed into 32bit unsigned int
|
|
groupshared FfxFloat32 InputCacheB [NUM_PIXELS_INPUT_CACHE]; // B : don't use fp16 for B to avoid bank conflicts
|
|
#else
|
|
shared FfxUInt32 InputCacheRG[NUM_PIXELS_INPUT_CACHE]; // RG: 2x fp16's are packed into 32bit unsigned int
|
|
shared FfxFloat32 InputCacheB [NUM_PIXELS_INPUT_CACHE]; // B : don't use fp16 for B to avoid bank conflicts
|
|
#endif
|
|
#else
|
|
#if BLUR_GROUPSHARED_MEMORY_SOA
|
|
BLUR_GROUPSHARED_MEMORY_TYPE InputCache[NUM_PIXELS_INPUT_CACHE * 3]; // stores rrrrrr...ggggggg...bbbbb...
|
|
#else
|
|
BLUR_GROUPSHARED_MEMORY_TYPE InputCache[NUM_PIXELS_INPUT_CACHE]; // stores rgbrgbrgbrgbrgbrgbrgbrgb...
|
|
#endif // BLUR_GROUPSHARED_MEMORY_SOA
|
|
#endif //BLUR_GROUPSHARED_MEMORY_PK_UINT
|
|
#endif // BLUR_ENABLE_INPUT_CACHE
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
//==============================================================================================================================
|
|
// GROUPSHARED MEMORY MAPPING FUNCTIONS
|
|
//==============================================================================================================================
|
|
// LDS ops ----------------------------------------------------------------------------------
|
|
// LDS TILES : FFX_BLUR_TILE_SIZE_X * FFX_BLUR_TILE_SIZE_Y sized tiles
|
|
// e.g. FFX_BLUR_TILE_SIZE_X = 8
|
|
//
|
|
// <------------ FFX_BLUR_TILE_SIZE_X -------------->
|
|
// ^ OutputCache[0-7]
|
|
// | OutputCache[8-15]
|
|
// |
|
|
// | // TILE #1
|
|
// |
|
|
// |
|
|
// v OutputCache[56-63]
|
|
// <------------------------------------------------>
|
|
// |
|
|
// <------------ FFX_BLUR_TILE_SIZE_X -------------->
|
|
// ^ OutputCache[64-71]
|
|
// |
|
|
// |
|
|
// | // TILE #2
|
|
// |
|
|
// |
|
|
// v
|
|
// <------------------------------------------------>
|
|
// |
|
|
//...
|
|
FfxUInt32 PackF2(FfxFloat32x2 c) { return (ffxF32ToF16(c.r) << 16) | ffxF32ToF16(c.g); }
|
|
FfxFloat32x2 UnpackToF2(FfxUInt32 packedRG)
|
|
{
|
|
#ifdef FFX_HLSL
|
|
return f16tof32(FfxUInt32x2(packedRG >> 16, packedRG & 0xFFFF));
|
|
#else
|
|
return unpackHalf2x16(packedRG).yx;
|
|
#endif
|
|
}
|
|
#if FFX_HALF
|
|
FfxUInt32 PackH2(FfxFloat16x2 c) { return (ffxF32ToF16(FfxFloat32(c.r)) << 16) | ffxF32ToF16(FfxFloat32(c.g)); } // TODO: is there a cast fp16->FfxUInt32 and skip fp16->fp32 promotion?
|
|
FfxFloat16x2 UnpackToH2(FfxUInt32 packedRG){ return FfxFloat16x2(UnpackToF2(packedRG)); }
|
|
#endif
|
|
|
|
#ifdef FFX_HLSL
|
|
inline FfxUInt32 FlattenIndex(FfxInt32x2 Index, FfxInt32 ElementStride)
|
|
#else
|
|
FfxUInt32 FlattenIndex(FfxInt32x2 Index, FfxInt32 ElementStride)
|
|
#endif
|
|
{
|
|
return Index.x + Index.y * ElementStride;
|
|
}
|
|
|
|
#if BLUR_GROUPSHARED_MEMORY_SOA
|
|
void SetOutputCache(FfxInt32x2 index, FfxFloat32x3 value)
|
|
{
|
|
FfxInt32 iLDS = index.x + index.y * BLUR_TILE_SIZE_X;
|
|
OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 0] = value.r;
|
|
OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 1] = value.g;
|
|
OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 2] = value.b;
|
|
}
|
|
FfxFloat32x3 GetOutputCache(FfxInt32x2 index)
|
|
{
|
|
FfxInt32 iLDS = index.x + index.y * FFX_BLUR_TILE_SIZE_X;
|
|
FfxFloat32x3 c;
|
|
c.r = OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 0];
|
|
c.g = OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 1];
|
|
c.b = OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 2];
|
|
return c;
|
|
}
|
|
#if BLUR_ENABLE_INPUT_CACHE
|
|
void SetInputCache(FfxInt32x2 index, FfxFloat32x3 value)
|
|
{
|
|
FfxInt32 iLDS = index.x + index.y * INPUT_CACHE_TILE_SIZE_X;
|
|
InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 0] = value.r;
|
|
InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 1] = value.g;
|
|
InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 2] = value.b;
|
|
}
|
|
FfxFloat32x3 GetInputCache(FfxInt32x2 index)
|
|
{
|
|
FfxInt32 iLDS = index.x + index.y * INPUT_CACHE_TILE_SIZE_X;
|
|
FfxFloat32x3 c;
|
|
c.r = InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 0];
|
|
c.g = InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 1];
|
|
c.b = InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 2];
|
|
return c;
|
|
}
|
|
#endif // BLUR_ENABLE_INPUT_CACHE
|
|
#else
|
|
#if BLUR_GROUPSHARED_MEMORY_PK_UINT
|
|
#if FFX_HALF
|
|
void SetOutputCache(FfxInt32x2 index, FfxFloat16x3 value)
|
|
{
|
|
FfxInt32 iLDS = index.x + index.y * FFX_BLUR_TILE_SIZE_X;
|
|
OutputCacheRG[iLDS] = PackH2(value.rg);
|
|
OutputCacheB[iLDS] = value.b;
|
|
}
|
|
FfxFloat16x3 GetOutputCache(FfxInt32x2 index)
|
|
{
|
|
FfxInt32 iLDS = index.x + index.y * FFX_BLUR_TILE_SIZE_X;
|
|
FfxFloat16x2 RG = UnpackToH2(OutputCacheRG[iLDS]);
|
|
return FfxFloat16x3(RG.r, RG.g, OutputCacheB[iLDS]);
|
|
}
|
|
#else
|
|
void SetOutputCache(FfxInt32x2 index, FfxFloat32x3 value)
|
|
{
|
|
FfxInt32 iLDS = index.x + index.y * FFX_BLUR_TILE_SIZE_X;
|
|
OutputCacheRG[iLDS] = PackF2(value.rg);
|
|
OutputCacheB[iLDS] = value.b;
|
|
}
|
|
FfxFloat32x3 GetOutputCache(FfxInt32x2 index)
|
|
{
|
|
FfxInt32 iLDS = index.x + index.y * FFX_BLUR_TILE_SIZE_X;
|
|
FfxFloat32x2 RG = UnpackToF2(OutputCacheRG[iLDS]);
|
|
return FfxFloat32x3(RG.r, RG.g, OutputCacheB[iLDS]);
|
|
}
|
|
#endif // FFX_HALF
|
|
#else
|
|
#if FFX_HALF
|
|
void SetOutputCache(FfxInt32x2 index, FfxFloat16x3 value)
|
|
{
|
|
const FfxUInt32 iLDS = FlattenIndex(index, FFX_BLUR_TILE_SIZE_X);
|
|
OutputCache[iLDS] = value;
|
|
}
|
|
FfxFloat16x3 GetOutputCache(FfxInt32x2 index)
|
|
{
|
|
const FfxUInt32 iLDS = FlattenIndex(index, FFX_BLUR_TILE_SIZE_X);
|
|
return OutputCache[iLDS];
|
|
}
|
|
#else
|
|
void SetOutputCache(FfxInt32x2 index, FfxFloat32x3 value)
|
|
{
|
|
const FfxUInt32 iLDS = FlattenIndex(index, FFX_BLUR_TILE_SIZE_X);
|
|
OutputCache[iLDS] = value;
|
|
}
|
|
FfxFloat32x3 GetOutputCache(FfxInt32x2 index)
|
|
{
|
|
const FfxUInt32 iLDS = FlattenIndex(index, FFX_BLUR_TILE_SIZE_X);
|
|
return OutputCache[iLDS];
|
|
}
|
|
#endif // FFX_HALF
|
|
#endif // BLUR_GROUPSHARED_MEMORY_PK_UINT
|
|
|
|
|
|
#if BLUR_ENABLE_INPUT_CACHE
|
|
#if BLUR_GROUPSHARED_MEMORY_PK_UINT
|
|
#if FFX_HALF
|
|
void SetInputCache(FfxInt32x2 index, FfxFloat16x3 value)
|
|
{
|
|
FfxInt32 iLDS = FlattenIndex(index, INPUT_CACHE_TILE_SIZE_X);
|
|
InputCacheRG[iLDS] = PackH2(value.rg);
|
|
InputCacheB[iLDS] = value.b;
|
|
}
|
|
FfxFloat16x3 GetInputCache(FfxInt32x2 index)
|
|
{
|
|
FfxInt32 iLDS = FlattenIndex(index, INPUT_CACHE_TILE_SIZE_X);
|
|
FfxFloat16x2 RG = UnpackToH2(InputCacheRG[iLDS]);
|
|
return FfxFloat16x3(RG.r, RG.g, InputCacheB[iLDS]);
|
|
}
|
|
#else
|
|
void SetInputCache(FfxInt32x2 index, FfxFloat32x3 value)
|
|
{
|
|
//FfxInt32 iLDS = FlattenIndex(index, INPUT_CACHE_TILE_SIZE_X);
|
|
FfxInt32 iLDS = index.x + index.y * INPUT_CACHE_TILE_SIZE_X;
|
|
InputCacheRG[iLDS] = PackF2(value.rg);
|
|
InputCacheB[iLDS] = value.b;
|
|
}
|
|
FfxFloat32x3 GetInputCache(FfxInt32x2 index)
|
|
{
|
|
//FfxInt32 iLDS = FlattenIndex(index, INPUT_CACHE_TILE_SIZE_X);
|
|
FfxInt32 iLDS = index.x + index.y * INPUT_CACHE_TILE_SIZE_X;
|
|
|
|
FfxFloat32x2 RG = UnpackToF2(InputCacheRG[iLDS]);
|
|
return FfxFloat32x3(RG.r, RG.g, InputCacheB[iLDS]);
|
|
}
|
|
#endif // FFX_HALF
|
|
#else
|
|
void SetInputCache(FfxInt32x2 index, FfxFloat32x3 value) { InputCache[index.x + index.y * INPUT_CACHE_TILE_SIZE_X].rgb = value; }
|
|
FfxFloat32x3 GetInputCache(FfxInt32x2 index) { return InputCache[index.x + index.y * INPUT_CACHE_TILE_SIZE_X].rgb; }
|
|
#endif // BLUR_GROUPSHARED_MEMORY_PK_UINT
|
|
#endif // BLUR_ENABLE_INPUT_CACHE
|
|
|
|
#endif // BLUR_GROUPSHARED_MEMORY_SOA
|
|
|
|
|
|
void LDSBarrier()
|
|
{
|
|
FFX_GROUP_MEMORY_BARRIER;
|
|
}
|
|
|
|
// index of the LDS tile in the ring buffer
|
|
#if FFX_HALF
|
|
#if BLUR_FP16_KERNEL_LOOPS
|
|
FfxInt16 GetOutputCacheTile(FfxInt16 iTile) { return FAST_MOD16(iTile, FfxInt16(NUM_TILES_OUTPUT_CACHE)); }
|
|
void CacheInOutputTile(FfxInt32x2 threadID, FfxInt16 iTile, FfxFloat16x3 color)
|
|
{
|
|
FfxInt16 iLDSTile = GetOutputCacheTile(iTile);
|
|
FfxInt16x2 TileOffset = FfxInt16x2(0, FfxInt16(FFX_BLUR_TILE_SIZE_Y) * iLDSTile);
|
|
FfxInt16x2 iLDS = FfxInt16x2(threadID) + TileOffset;
|
|
SetOutputCache(iLDS, color);
|
|
}
|
|
FfxFloat16x3 LoadFromCachedOutputTile(FfxInt32x2 threadID, FfxInt16 iTile)
|
|
{
|
|
FfxInt16 iLDSTile = GetOutputCacheTile(iTile);
|
|
FfxInt16x2 TileOffset = FfxInt16x2(0, FfxInt16(FFX_BLUR_TILE_SIZE_Y) * iLDSTile);
|
|
FfxInt16x2 iLDS = FfxInt16x2(threadID) + TileOffset;
|
|
return GetOutputCache(iLDS);
|
|
}
|
|
#else // BLUR_FP16_KERNEL_LOOPS
|
|
FfxInt32 GetOutputCacheTile(FfxInt32 iTile) { return FAST_MOD(iTile, NUM_TILES_OUTPUT_CACHE); }
|
|
void CacheInOutputTile(FfxInt32x2 threadID, FfxInt32 iTile, FfxFloat16x3 color)
|
|
{
|
|
FfxInt32 iLDSTile = GetOutputCacheTile(iTile);
|
|
FfxInt32x2 TileOffset = FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iLDSTile);
|
|
FfxInt32x2 iLDS = threadID + TileOffset;
|
|
SetOutputCache(iLDS, color);
|
|
}
|
|
FfxFloat32x3 LoadFromCachedOutputTile(FfxInt32x2 threadID, FfxInt32 iTile)
|
|
{
|
|
FfxInt32 iLDSTile = GetOutputCacheTile(iTile);
|
|
FfxInt32x2 TileOffset = FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iLDSTile);
|
|
FfxInt32x2 iLDS = threadID + TileOffset;
|
|
return GetOutputCache(iLDS);
|
|
}
|
|
#endif // BLUR_FP16_KERNEL_LOOPS
|
|
|
|
#else
|
|
FfxInt32 GetOutputCacheTile(FfxInt32 iTile) { return FAST_MOD(iTile, NUM_TILES_OUTPUT_CACHE); }
|
|
void CacheInOutputTile(FfxInt32x2 threadID, FfxInt32 iTile, FfxFloat32x3 color)
|
|
{
|
|
FfxInt32 iLDSTile = GetOutputCacheTile(iTile); // map image tile to LDS ring buffered tiles
|
|
FfxInt32x2 TileOffset = FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iLDSTile); // pixel offset for this tile
|
|
FfxInt32x2 iLDS = threadID + TileOffset; // 2D LDS coord based on local thread ID
|
|
SetOutputCache(iLDS, color);
|
|
}
|
|
FfxFloat32x3 LoadFromCachedOutputTile(FfxInt32x2 threadID, FfxInt32 iTile)
|
|
{
|
|
FfxInt32 iLDSTile = GetOutputCacheTile(iTile); // map image tile to LDS ring buffered tiles
|
|
FfxInt32x2 TileOffset = FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iLDSTile); // pixel offset for this tile
|
|
FfxInt32x2 iLDS = threadID + TileOffset; // 2D LDS coord based on local thread ID
|
|
return GetOutputCache(iLDS);
|
|
}
|
|
#endif // FFX_HALF
|
|
|
|
#if BLUR_ENABLE_INPUT_CACHE
|
|
#if FFX_HALF
|
|
void CacheInInputTile(FfxInt32x2 threadID, FfxInt32 KernelOffset, FfxFloat16x3 c)
|
|
#else
|
|
void CacheInInputTile(FfxInt32x2 threadID, FfxInt32 KernelOffset, FfxFloat32x3 c)
|
|
#endif
|
|
{
|
|
FfxInt32x2 InputCacheCoord = threadID + FfxInt32x2(KernelOffset, 0) + FfxInt32x2(FFX_BLUR_KERNEL_RANGE-1, 0);
|
|
SetInputCache(InputCacheCoord, c);
|
|
}
|
|
|
|
#if FFX_HALF
|
|
FfxFloat16x3 LoadFromCachedInputTile(FfxInt32x2 threadID, FfxInt32 KernelOffset)
|
|
#else
|
|
FfxFloat32x3 LoadFromCachedInputTile(FfxInt32x2 threadID, FfxInt32 KernelOffset)
|
|
#endif
|
|
{
|
|
FfxInt32x2 InputCacheCoord = threadID + FfxInt32x2(KernelOffset,0) + FfxInt32x2(FFX_BLUR_KERNEL_RANGE-1, 0);
|
|
return GetInputCache(InputCacheCoord);
|
|
}
|
|
#endif // BLUR_ENABLE_INPUT_CACHE
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
//==============================================================================================================================
|
|
// BLUR FUNCTIONS
|
|
//==============================================================================================================================
|
|
#if FFX_HALF
|
|
FfxFloat16x3 HorizontalBlurFromTexture(FfxInt32x2 CenterPixelLocation, FfxInt32x2 ImageSize)
|
|
{
|
|
#if BLUR_FP16_CLAMP // this uses 4 less VGPRs but not faster
|
|
const FfxInt16 ImageSizeClampValueX = FfxInt16(ImageSize.x - 1);
|
|
FfxFloat16x3 BlurredImage = BlurLoadInput(FfxInt16x2(CenterPixelLocation)) * BlurLoadKernelWeight(0);
|
|
for (FfxInt32 i = 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
|
|
{
|
|
FfxInt32x2 Offset = FfxInt32x2(i, 0);
|
|
FfxInt16x2 SampleCoordXX = FfxInt16x2(CenterPixelLocation.x + i, CenterPixelLocation.x - i);
|
|
#if !BLUR_DISABLE_CLAMP
|
|
SampleCoordXX = clamp(SampleCoordXX, FfxInt16x2(0, 0), FfxInt16x2(ImageSizeClampValueX, ImageSizeClampValueX));
|
|
#endif
|
|
BlurredImage += BlurLoadInput(FfxInt16x2(SampleCoordXX[0], CenterPixelLocation.y)) * BlurLoadKernelWeight(i);
|
|
BlurredImage += BlurLoadInput(FfxInt16x2(SampleCoordXX[1], CenterPixelLocation.y)) * BlurLoadKernelWeight(i);
|
|
}
|
|
#else
|
|
FfxFloat16x3 BlurredImage = FfxFloat16x3(0.f, 0.f, 0.f);
|
|
for (FfxInt32 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
|
|
{
|
|
FfxInt32x2 SampleCoord = CenterPixelLocation + FfxInt32x2(i, 0); // horizontal blur
|
|
#if !BLUR_DISABLE_CLAMP
|
|
SampleCoord.x = clamp(SampleCoord.x, 0, ImageSize.x-1); // clamp
|
|
#endif
|
|
FfxFloat16x3 c = BlurLoadInput(SampleCoord);
|
|
BlurredImage += c * BlurLoadKernelWeight(abs(i));
|
|
}
|
|
#endif // BLUR_FP16_CLAMP
|
|
return BlurredImage;
|
|
}
|
|
#else // FFX_HALF
|
|
FfxFloat32x3 HorizontalBlurFromTexture(FfxInt32x2 CenterPixelLocation, FfxInt32x2 ImageSize)
|
|
{
|
|
FfxFloat32x3 BlurredImage = FfxFloat32x3(0.f, 0.f, 0.f);
|
|
for (FfxInt32 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
|
|
{
|
|
FfxInt32x2 SampleCoord = CenterPixelLocation + FfxInt32x2(i, 0); // horizontal blur
|
|
#if !BLUR_DISABLE_CLAMP
|
|
SampleCoord.x = clamp(SampleCoord.x, 0, ImageSize.x-1); // clamp
|
|
#endif
|
|
FfxFloat32x3 c = BlurLoadInput(SampleCoord);
|
|
BlurredImage += c * BlurLoadKernelWeight(abs(i));
|
|
}
|
|
return BlurredImage;
|
|
}
|
|
#endif // FFX_HALF
|
|
|
|
#if BLUR_ENABLE_INPUT_CACHE
|
|
#if FFX_HALF
|
|
FfxFloat16x3 HorizontalBlurFromCachedInput(FfxInt32x2 threadID)
|
|
#else
|
|
FfxFloat32x3 HorizontalBlurFromCachedInput(FfxInt32x2 threadID)
|
|
#endif
|
|
{
|
|
#if FFX_HALF
|
|
FfxFloat16x3 BlurredImage = FfxFloat16x3(0.f, 0.f, 0.f);
|
|
#else
|
|
FfxFloat32x3 BlurredImage = FfxFloat32x3(0.f, 0.f, 0.f);
|
|
#endif // FFX_HALF
|
|
|
|
for (FfxInt32 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
|
|
{
|
|
BlurredImage += LoadFromCachedInputTile(threadID, i) * BlurLoadKernelWeight(abs(i));
|
|
}
|
|
return BlurredImage;
|
|
}
|
|
#endif // BLUR_ENABLE_INPUT_CACHE
|
|
|
|
#if FFX_HALF
|
|
#if BLUR_FP16_KERNEL_LOOPS
|
|
FfxFloat16x3 VerticalBlurFromCachedOutput(FfxInt32x2 ThreadID, FfxInt32x2 WorkGroupID,FfxInt16x2 CenterPixelLocation, FfxInt16x2 ImageSize)
|
|
{
|
|
const FfxInt16x2 ImageSizeClampValueXY = ImageSize.xy - FfxInt16x2(1, 1);
|
|
const FfxUInt32 iTileCount = DIV_AND_ROUND_UP(FfxUInt32(ImageSize.y), FFX_BLUR_TILE_SIZE_Y * FFX_BLUR_DISPATCH_Y);
|
|
FfxFloat16x3 value = FfxFloat16x3(0, 0, 0);
|
|
#ifndef FFX_HLSL
|
|
// For some reason using 16 bit integer for this loop in glsl does not work. It seems to be due to the use of
|
|
// a negative value as a starting value that is compared to a positive value, which seems to incorrectly cause
|
|
// the condition to always be false.
|
|
for (FfxInt32 i = (FfxInt32(-FFX_BLUR_KERNEL_RANGE) + FfxInt32(1)); i < FfxInt32(FFX_BLUR_KERNEL_RANGE); ++i)
|
|
#else
|
|
for (FfxInt16 i = (FfxInt16(-FFX_BLUR_KERNEL_RANGE) + FfxInt16(1)); i < FfxInt16(FFX_BLUR_KERNEL_RANGE); ++i)
|
|
#endif
|
|
{
|
|
FfxInt16x2 KernelSampleLocation = CenterPixelLocation + FfxInt16x2(0, i);
|
|
#if !BLUR_DISABLE_CLAMP
|
|
KernelSampleLocation.xy = clamp(KernelSampleLocation.xy, FfxInt16x2(0, 0), ImageSizeClampValueXY);
|
|
#endif
|
|
|
|
const FfxInt16 iTile_ImageSpace = FfxInt16(KernelSampleLocation.y * BLUR_TILE_SIZE_Y_INV);
|
|
const FfxInt16 iTile = FfxInt16(iTile_ImageSpace - iTileCount * WorkGroupID.y);
|
|
FfxInt16x2 TileThreadID = FfxInt16x2(ThreadID.x, FAST_MOD16(KernelSampleLocation.y, FfxInt16(FFX_BLUR_TILE_SIZE_Y)));
|
|
|
|
FfxFloat16x3 c = LoadFromCachedOutputTile(TileThreadID, iTile);
|
|
value += c * BlurLoadKernelWeight(abs(i));
|
|
}
|
|
return value;
|
|
}
|
|
#else
|
|
FfxFloat16x3 VerticalBlurFromCachedOutput(FfxInt32x2 ThreadID, FfxInt32x2 WorkGroupID, FfxInt32x2 CenterPixelLocation, FfxInt32x2 ImageSize)
|
|
{
|
|
const FfxUInt32 iTileCount = DIV_AND_ROUND_UP(ImageSize.y, FFX_BLUR_TILE_SIZE_Y * FFX_BLUR_DISPATCH_Y);
|
|
FfxFloat16x3 value = FfxFloat16x3(0, 0, 0);
|
|
#if BLUR_FP16_CLAMP
|
|
const FfxInt16x2 ClampUpperLimitXY = ImageSize.xy - 1;
|
|
for (FfxInt16 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
|
|
{
|
|
FfxInt16x2 KernelSampleLocation = CenterPixelLocation + FfxInt16x2(0, i);
|
|
#if !BLUR_DISABLE_CLAMP
|
|
#if BLUR_OPTIMIZED_CLAMP
|
|
bool bNegative = firstbithigh(KernelSampleLocation.x) == 31;
|
|
KernelSampleLocation.y = bNegative ? 0 : KernelSampleLocation.y;
|
|
#else
|
|
KernelSampleLocation.xy = clamp(KernelSampleLocation.xy, FfxInt16x2(0, 0), ClampUpperLimitXY);
|
|
#endif // BLUR_OPTIMIZED_CLAMP
|
|
#endif
|
|
const FfxInt16 iTile_ImageSpace = FfxInt16(KernelSampleLocation.y * FFX_BLUR_TILE_SIZE_Y_INV);
|
|
const FfxInt16 iTile = iTile_ImageSpace - iTileCount * WorkGroupID.y;
|
|
FfxInt16x2 TileThreadID = FfxInt16x2(ThreadID.x, FAST_MOD16(KernelSampleLocation.y, FfxInt16(FFX_BLUR_TILE_SIZE_Y)));
|
|
|
|
FfxFloat16x3 c = LoadFromCachedOutputTile(TileThreadID, iTile);
|
|
value += c * BlurLoadKernelWeight(abs(i));
|
|
}
|
|
#else
|
|
for (FfxInt32 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
|
|
{
|
|
FfxInt32x2 KernelSampleLocation = CenterPixelLocation + FfxInt32x2(0, i);
|
|
#if !BLUR_DISABLE_CLAMP
|
|
#if BLUR_OPTIMIZED_CLAMP
|
|
bool bNegative = firstbithigh(KernelSampleLocation.x) == 31;
|
|
KernelSampleLocation.y = bNegative ? 0 : KernelSampleLocation.y;
|
|
#else
|
|
KernelSampleLocation.xy = clamp(KernelSampleLocation.xy, 0, ImageSize.xy-1);
|
|
#endif // BLUR_OPTIMIZED_CLAMP
|
|
#endif
|
|
|
|
const FfxInt32 iTile_ImageSpace = FfxInt32(KernelSampleLocation.y * FFX_BLUR_TILE_SIZE_Y_INV);
|
|
const FfxInt32 iTile = iTile_ImageSpace - (iTileCountPerWorkgroup * WorkGroupID.y);
|
|
|
|
FfxInt32x2 TileThreadID = FfxInt32x2(ThreadID.x, FAST_MOD(KernelSampleLocation.y, FFX_BLUR_TILE_SIZE_Y));
|
|
|
|
FfxFloat16x3 c = LoadFromCachedOutputTile(TileThreadID, iTile);
|
|
value += c * BlurLoadKernelWeight(abs(i));
|
|
}
|
|
#endif // BLUR_FP16_CLAMP
|
|
return value;
|
|
}
|
|
#endif // BLUR_FP16_KERNEL_LOOPS
|
|
#else // FFX_HALF
|
|
FfxFloat32x3 VerticalBlurFromCachedOutput(FfxInt32x2 ThreadID, FfxInt32x2 WorkGroupID,FfxInt32x2 CenterPixelLocation, FfxInt32x2 ImageSize)
|
|
{
|
|
const FfxInt32 iTileCountPerWorkgroup = DIV_AND_ROUND_UP(ImageSize.y, FFX_BLUR_TILE_SIZE_Y * FFX_BLUR_DISPATCH_Y);
|
|
|
|
FfxFloat32x3 value = FfxFloat32x3(0,0,0);
|
|
for (FfxInt32 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
|
|
{
|
|
FfxInt32x2 KernelSampleLocation = CenterPixelLocation + FfxInt32x2(0, i);
|
|
#if !BLUR_DISABLE_CLAMP
|
|
KernelSampleLocation.xy = clamp(KernelSampleLocation.xy, FfxInt32x2(0, 0), ImageSize.xy-1);
|
|
#endif
|
|
|
|
// which 'global' tile in the image space
|
|
const FfxInt32 iTile_ImageSpace = FfxInt32(KernelSampleLocation.y * BLUR_TILE_SIZE_Y_INV);
|
|
|
|
// local tile in this workgroup - apply the offset to convert to local space tile coordinates
|
|
// this is needed for workgroups that have WorkgroupID.y > 0: the previous workgroup's
|
|
// tile mapping doesn't have to align with the current one's depending on the FFX_BLUR_TILE_SIZE_XY.
|
|
// e.g. WorkGroupID=1's first tile will map to 0 in local space, but could be some non-0 index
|
|
// in the local space of the previous workgroup (WorkGroupID=0).
|
|
// Not correcting for this mapping will result in a chopped image on the workgroup borders.
|
|
const FfxInt32 iTile = iTile_ImageSpace - (iTileCountPerWorkgroup * WorkGroupID.y);
|
|
|
|
FfxInt32x2 TileThreadID = FfxInt32x2(ThreadID.x, FAST_MOD(KernelSampleLocation.y, FFX_BLUR_TILE_SIZE_Y));
|
|
//FfxInt32x2 TileThreadID = FfxInt32x2(ThreadID.x, KernelSampleLocation.y % BLUR_TILE_SIZE_Y);
|
|
|
|
value += LoadFromCachedOutputTile(TileThreadID, iTile) * BlurLoadKernelWeight(abs(i));
|
|
}
|
|
return value;
|
|
}
|
|
#endif // FFX_HALF
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
//==============================================================================================================================
|
|
// INPUT/OUTPUT CACHE HELPERS
|
|
//==============================================================================================================================
|
|
//
|
|
#if BLUR_ENABLE_INPUT_CACHE
|
|
// Fills the input cache with the corresponding rgion from the image + kernel extents
|
|
void FillInputCache(in FfxInt32x2 lxy, FfxInt32x2 CenterPixelLocation, FfxInt32x2 ImageSize)
|
|
{
|
|
#if FFX_HALF
|
|
FfxFloat16x3 c = FfxFloat16x3(0, 0, 0);
|
|
#else
|
|
FfxFloat32x3 c = FfxFloat32x3(0, 0, 0);
|
|
#endif
|
|
|
|
// slide the thread group over the InputCache
|
|
const FfxInt32 iNumLoops = DIV_AND_ROUND_UP(INPUT_CACHE_TILE_SIZE_X, FFX_BLUR_TILE_SIZE_X);
|
|
for (FfxInt32 i = 0; i < iNumLoops; ++i)
|
|
{
|
|
FfxInt32x2 LDSCoord = lxy - FfxInt32x2(FFX_BLUR_KERNEL_RANGE - 1, 0) + FfxInt32x2(i * FFX_BLUR_TILE_SIZE_X, 0);
|
|
FfxInt32x2 SamplePosition = CenterPixelLocation - FfxInt32x2(FFX_BLUR_KERNEL_RANGE - 1, 0) + FfxInt32x2(i * FFX_BLUR_TILE_SIZE_X, 0);
|
|
#if !BLUR_DISABLE_CLAMP
|
|
SamplePosition.x = clamp(SamplePosition.x, 0, ImageSize.x - 1);
|
|
#endif
|
|
c = BlurLoadInput(SamplePosition);
|
|
|
|
// clamp to LDS bounds if we're on the last iteration of the loop
|
|
if (i == iNumLoops - 1)
|
|
{
|
|
// faster than 'if (LDSCoord.x < INPUT_CACHE_TILE_SIZE_X)', avoids a vmem sync at the cost of some ALU
|
|
LDSCoord.x = clamp(LDSCoord.x, -FfxInt32x2(FFX_BLUR_KERNEL_RANGE - 1, 0), (INPUT_CACHE_TILE_SIZE_X - 1) - FfxInt32x2(FFX_BLUR_KERNEL_RANGE - 1, 0));
|
|
}
|
|
CacheInInputTile(LDSCoord, 0, c);
|
|
}
|
|
LDSBarrier();
|
|
}
|
|
#endif // BLUR_ENABLE_INPUT_CACHE
|
|
// Fills the output cache with the horizontally-blurred image.
|
|
void PreFillOutputCache(in FfxInt32x2 gxy, in FfxInt32x2 lxy, in FfxInt32x2 WorkGroupID, FfxInt32x2 ImageSize)
|
|
{
|
|
#if BLUR_DEBUG_PREFILL_OUTPUT_CACHE_WITH_COLOR
|
|
FfxFloat32x3 FillColor = FfxFloat32x3(0, 0, 0); // black border color
|
|
[unroll]
|
|
for (FfxInt32 iTile = 0; iTile < NUM_TILES_OUTPUT_CACHE; ++iTile)
|
|
{
|
|
CacheInOutputTile(lxy, iTile, FillColor);
|
|
}
|
|
LDSBarrier();
|
|
#endif
|
|
|
|
// load from VMEM the first NUM_PREFILL_TILES_OUTPUT_CACHE tiles
|
|
// while doing the horizontal blur, going top down
|
|
#if FFX_HALF
|
|
for (FfxInt16 j = FfxInt16(0); j < FfxInt16(NUM_PREFILL_TILES_OUTPUT_CACHE); ++j)
|
|
#else
|
|
for (FfxInt32 j = 0; j < NUM_PREFILL_TILES_OUTPUT_CACHE; ++j)
|
|
#endif
|
|
{
|
|
const FfxInt32x2 ImageCoordinate = gxy + FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * j);
|
|
#if FFX_HALF
|
|
FfxFloat16x3 c = HorizontalBlurFromTexture(ImageCoordinate, ImageSize);
|
|
#else
|
|
FfxFloat32x3 c = HorizontalBlurFromTexture(ImageCoordinate, ImageSize);
|
|
#endif
|
|
CacheInOutputTile(lxy, j, c);
|
|
}
|
|
|
|
#if FFX_BLUR_DISPATCH_Y != 1
|
|
// for any workgroup that doesn't start frop the top of the image,
|
|
// fill the cache from the tail, going upwards in the image space
|
|
if (WorkGroupID.y != 0)
|
|
{
|
|
|
|
#if FFX_HALF
|
|
FfxFloat16x3 c = FfxFloat16x3(0, 0, 0);
|
|
#else
|
|
FfxFloat32x3 c = FfxFloat32x3(0, 0, 0);
|
|
#endif
|
|
#if FFX_HALF
|
|
for (FfxInt16 j = FfxInt16(1); j < FfxInt16(NUM_PREFILL_TILES_OUTPUT_CACHE+1 + (DIV_AND_ROUND_UP(FFX_BLUR_KERNEL_RANGE, FFX_BLUR_TILE_SIZE_Y))); ++j)
|
|
#else
|
|
for (FfxInt32 j = 1; j < NUM_PREFILL_TILES_OUTPUT_CACHE+1 + (DIV_AND_ROUND_UP(FFX_BLUR_KERNEL_RANGE, FFX_BLUR_TILE_SIZE_Y)); ++j)
|
|
#endif
|
|
{
|
|
const FfxInt32x2 ImageCoordinate = gxy - FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * j);
|
|
|
|
#if BLUR_ENABLE_INPUT_CACHE
|
|
FillInputCache(lxy, ImageCoordinate, ImageSize);
|
|
c = HorizontalBlurFromCachedInput(lxy);
|
|
#else
|
|
c = HorizontalBlurFromTexture(ImageCoordinate, ImageSize);
|
|
#endif
|
|
|
|
#if FFX_HALF
|
|
CacheInOutputTile(lxy, (FfxInt16(NUM_TILES_OUTPUT_CACHE) - j), c);
|
|
#else
|
|
CacheInOutputTile(lxy, (NUM_TILES_OUTPUT_CACHE - j), c);
|
|
#endif
|
|
}
|
|
}
|
|
#endif
|
|
|
|
LDSBarrier(); // OutputCache Sync: Read -> Write =========================================
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
//==============================================================================================================================
|
|
// BLUR GAUSSIAN BLUR ALGORITHM
|
|
//==============================================================================================================================
|
|
|
|
/// ffxBlur: The main idea of the algorithm is to utilize a number of tiles (8x8) that are cached on the groupshared memory
|
|
/// in a ring-buffer fashion to speed up texture lookups in a hand-optimized compute shader.
|
|
/// The tiles are defined by the FFX_BLUR_TILE_SIZE_X and FFX_BLUR_TILE_SIZE_Y defines, and are typically 8x8 pixels.
|
|
/// The image is horizontally blurred while being cached on the groupshared memory,
|
|
/// and when all the groupshared tiles are filled, a vertical blur pass is done on the groupshared memory
|
|
/// and the result is stored in the UAV as the final destination.
|
|
///
|
|
/// The algorithm is as follows:
|
|
/// - Pre-fill LDS with 8x8 tiles, storing vertical tiles, containing horizontally blurred color
|
|
/// - Loop until the entire image is covered:
|
|
/// - Run a vertical blur pass on the LDS and output to final destination UAV
|
|
/// - Re-fill LDS with horizontally-blurred data
|
|
/// - Finish off the remaining last row/section of the image
|
|
///
|
|
/// @param [in] GlobalThreadID The SV_DispatchThreadID.xy or gl_GlobalInvocationID.xy.
|
|
/// @param [in] WorkGroupLocalThreadID The SV_GroupThreadID.xy or gl_LocalInvocationID.xy.
|
|
/// @param [in] WorkGroupID The SV_GroupID.xy or gl_WorkGroupID.xy.
|
|
/// @param [in] ImageSize The two dimensional size of the input and output image.
|
|
///
|
|
/// @ingroup FfxGPUBlur
|
|
void ffxBlur(
|
|
in FfxInt32x2 GlobalThreadID,
|
|
in FfxInt32x2 WorkGroupLocalThreadID,
|
|
in FfxInt32x2 WorkGroupID,
|
|
FfxInt32x2 ImageSize)
|
|
{
|
|
// Each threadgroup processes a number of tiles of size FFX_BLUR_TILE_SIZE_Y * FFX_BLUR_TILE_SIZE_X
|
|
// This number depends on the image height and the vertical dimension (_Y) of the FFX_BLUR_TILE_SIZE
|
|
//const FfxUInt32 iTileCount = DIV_AND_ROUND_UP(ImageSize.y, FFX_BLUR_TILE_SIZE_Y);
|
|
const FfxUInt32 iTileCount =
|
|
DIV_AND_ROUND_UP(
|
|
ImageSize.y,
|
|
FFX_BLUR_TILE_SIZE_Y * FFX_BLUR_DISPATCH_Y);
|
|
|
|
FfxInt32x2 gxy = FfxInt32x2(
|
|
WorkGroupID.x * FFX_BLUR_TILE_SIZE_X + WorkGroupLocalThreadID.x
|
|
, WorkGroupLocalThreadID.y + WorkGroupID.y * iTileCount * FFX_BLUR_TILE_SIZE_Y
|
|
);
|
|
FfxInt32x2 lxy = WorkGroupLocalThreadID;
|
|
|
|
if (gxy.x >= ImageSize.x)
|
|
return;
|
|
|
|
//-------------------------------------------------------------------------------------------------
|
|
// STEP #1
|
|
//-------------------------------------------------------------------------------------------------
|
|
// Pre-fill the output cache with a few tiles of horizontally blurred image.
|
|
// The tile count to pre-fill is a function of kernel width and TileSizeY.
|
|
PreFillOutputCache(gxy, lxy, WorkGroupID, ImageSize); // doesn't sync waves
|
|
|
|
|
|
//-------------------------------------------------------------------------------------------------
|
|
// STEP #2
|
|
//-------------------------------------------------------------------------------------------------
|
|
// loop through the tiles and write out to UAV as we go from top to down
|
|
FfxInt32 iTileOutput = 0;
|
|
|
|
#if FFX_HALF
|
|
FfxFloat16x3 c = FfxFloat16x3(0, 0, 0);
|
|
#else
|
|
FfxFloat32x3 c = FfxFloat32x3(0, 0, 0);
|
|
#endif
|
|
for (; iTileOutput < iTileCount - NUM_PREFILL_TILES_OUTPUT_CACHE; ++iTileOutput)
|
|
{
|
|
// index of next tile that we'll cache the output to
|
|
// It runs ahead of the tile we will be writing out to UAV by NUM_PREFILL_TILES_OUTPUT_CACHE tiles
|
|
FfxInt32 iNextTileOutputCache = iTileOutput + NUM_PREFILL_TILES_OUTPUT_CACHE;
|
|
const FfxInt32x2 HorizontalBlurInputCoord = gxy + FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iNextTileOutputCache);
|
|
|
|
// run horizontal blur & cache the next output tile
|
|
#if BLUR_ENABLE_INPUT_CACHE
|
|
FillInputCache(lxy, HorizontalBlurInputCoord, ImageSize);
|
|
c = HorizontalBlurFromCachedInput(lxy);
|
|
#else
|
|
// Number of image_load instructions will scale with FFX_BLUR_OPTION_KERNEL_DIMENSION.
|
|
c = HorizontalBlurFromTexture(HorizontalBlurInputCoord, ImageSize);
|
|
#endif // BLUR_ENABLE_INPUT_CACHE
|
|
|
|
#if FFX_HALF
|
|
CacheInOutputTile(lxy, FfxInt16(iNextTileOutputCache), c);
|
|
#else
|
|
CacheInOutputTile(lxy, iNextTileOutputCache, c);
|
|
#endif
|
|
|
|
LDSBarrier(); // OutputCache Sync: Write -> Read =========================================
|
|
|
|
// Start writing out the pixel value which has its final value
|
|
// convolved from the pixels aready in the LDS section.
|
|
const FfxInt32x2 OutputCoord = gxy + FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iTileOutput);
|
|
#if FFX_HALF
|
|
c = VerticalBlurFromCachedOutput(lxy, WorkGroupID, FfxInt16x2(OutputCoord), FfxInt16x2(ImageSize));
|
|
#else
|
|
c = VerticalBlurFromCachedOutput(lxy, WorkGroupID, OutputCoord, ImageSize);
|
|
#endif
|
|
BlurStoreOutput(OutputCoord, c);
|
|
LDSBarrier(); // OutputCache Sync: Read -> Write =========================================
|
|
}
|
|
|
|
|
|
//-------------------------------------------------------------------------------------------------
|
|
// STEP #3
|
|
//-------------------------------------------------------------------------------------------------
|
|
// fill in the remaining last tiles (= loop for NUM_PREFILL_TILES_OUTPUT_CACHE)
|
|
for (; iTileOutput < iTileCount; ++iTileOutput)
|
|
{
|
|
const FfxInt32x2 OutputCoord = gxy + FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iTileOutput);
|
|
if (iTileOutput >= iTileCount - NUM_PREFILL_TILES_OUTPUT_CACHE)
|
|
{
|
|
FfxInt32 iNextTileOutputCache = iTileOutput + NUM_PREFILL_TILES_OUTPUT_CACHE;
|
|
const FfxInt32x2 HorizontalBlurInputCoord = gxy + FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iNextTileOutputCache);
|
|
|
|
// run horizontal blur & cache the next output tile
|
|
#if BLUR_ENABLE_INPUT_CACHE
|
|
FillInputCache(lxy, HorizontalBlurInputCoord, ImageSize);
|
|
c = HorizontalBlurFromCachedInput(lxy);
|
|
#else
|
|
// Number of image_load instructions will scale with FFX_BLUR_OPTION_KERNEL_DIMENSION.
|
|
c = HorizontalBlurFromTexture(HorizontalBlurInputCoord, ImageSize);
|
|
#endif // BLUR_ENABLE_INPUT_CACHE
|
|
|
|
#if FFX_HALF
|
|
CacheInOutputTile(lxy, FfxInt16(iNextTileOutputCache), c);
|
|
#else
|
|
CacheInOutputTile(lxy, iNextTileOutputCache, c);
|
|
#endif
|
|
|
|
LDSBarrier(); // OutputCache Sync: Write -> Read =========================================
|
|
}
|
|
#if FFX_HALF
|
|
c = VerticalBlurFromCachedOutput(lxy, WorkGroupID, FfxInt16x2(OutputCoord), FfxInt16x2(ImageSize));
|
|
#else
|
|
c = VerticalBlurFromCachedOutput(lxy, WorkGroupID, OutputCoord, ImageSize);
|
|
#endif
|
|
BlurStoreOutput(OutputCoord, c);
|
|
}
|
|
}
|
|
|
|
#endif // !FFX_CPU
|
|
#endif // FFX_BLUR_H
|