// This file is part of the FidelityFX SDK. // // Copyright (C) 2024 Advanced Micro Devices, Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and /or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions : // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef FFX_BLUR_H
#define FFX_BLUR_H
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//
//                                                   [FFX BLUR] Blur
//
//==============================================================================================================================
//
/// @defgroup FfxGPUBlur FidelityFX BLUR
/// FidelityFX Blur GPU documentation
///
/// @ingroup FfxGPUEffects
//------------------------------------------------------------------------------------------------------------------------------
//
// ABOUT
// =====
// AMD FidelityFX Blur is a collection of blurring effects implemented on compute shaders, hand-optimized for maximum performance.
// FFX-Blur includes
//  - Gaussian Blur w/ large kernel support (up to 21x21)
//
//==============================================================================================================================

//==============================================================================================================================
//                                                     BLUR SETUP
//==============================================================================================================================

/// FFX_BLUR_TILE_SIZE_Y: Tile Y dimensions that the local threadgroup will work on.
/// Note: each threadgroup is responsible for blurring a tile of the input image.
/// Note: the value is used as the divisor of FAST_MOD / FAST_MOD16 further down in this file,
///       which are only valid for power-of-two divisors.
///
/// @ingroup FfxGPUBlur
#ifndef FFX_BLUR_TILE_SIZE_Y
#define FFX_BLUR_TILE_SIZE_Y 8
#endif

/// FFX_BLUR_TILE_SIZE_X: Tile X dimensions that the local threadgroup will work on.
/// Note: each threadgroup is responsible for blurring a tile of the input image.
///
/// @ingroup FfxGPUBlur
#ifndef FFX_BLUR_TILE_SIZE_X
#define FFX_BLUR_TILE_SIZE_X 8
#endif

/// FFX_BLUR_DISPATCH_Y: Y dimension of the Blur compute dispatch.
/// The compute dispatch on the CPU side uses this value when invoking Dispatch.
/// It is intentionally not wrapped in an #ifndef: the CPU and GPU sides must agree on it.
///
/// @ingroup FfxGPUBlur
#define FFX_BLUR_DISPATCH_Y 8

#ifndef FFX_CPU

/// FFX_BLUR_OPTION_KERNEL_DIMENSION needs to be defined by the client application
/// App should define e.g the following for 5x5 blur:
///    #define FFX_BLUR_OPTION_KERNEL_DIMENSION 5
///
/// @ingroup FfxGPUBlur
#ifndef FFX_BLUR_OPTION_KERNEL_DIMENSION
#error Please define FFX_BLUR_OPTION_KERNEL_DIMENSION
#endif

/// FFX_BLUR_KERNEL_RANGE is defined relative to FFX_BLUR_OPTION_KERNEL_DIMENSION
/// See ffx_blur_callbacks_*.h for details.
///
/// @ingroup FfxGPUBlur
#ifndef FFX_BLUR_KERNEL_RANGE
#error Please define FFX_BLUR_KERNEL_RANGE
#endif

//--------------------------------------------------------------------------------------
// BLUR CONFIG
//--------------------------------------------------------------------------------------
// hardcoded variants
#define BLUR_DEBUG_PREFILL_OUTPUT_CACHE_WITH_COLOR 0
#define BLUR_GROUPSHARED_MEMORY_SOA     0 // [Deprecated] improves LDS but too high traffic still
#define BLUR_GROUPSHARED_MEMORY_HALF    0 // [Deprecated] LOTS of LDS traffic (1x ds_read per channel), need to pack with FfxUInt32
#define BLUR_GROUPSHARED_MEMORY_PK_UINT 1 // 1x ds_read2st64_b32 for all three channels
#define BLUR_FP16_KERNEL_LOOPS          1 // use fp16 for kernel loop counters and lds indexing (increases VGPR due to sdwa)
#define BLUR_FP16_CLAMP                 1 // ensure fp16 min/max is used for clamp()

// cpu-driven variants
#ifndef BLUR_ENABLE_INPUT_CACHE
#define BLUR_ENABLE_INPUT_CACHE 0 // currently only slows the algorithm :(
#endif
#ifndef BLUR_DISABLE_CLAMP
#define BLUR_DISABLE_CLAMP 0 // Generates incorrect image at the image borders (no clamp), for testing theoretical speed
#endif
#ifndef BLUR_OPTIMIZED_CLAMP
#define BLUR_OPTIMIZED_CLAMP 0 // [Experimental] Testing a new optimized clamp ISA
#endif

// constants
// Reciprocal of the tile height. The tile height is client-overridable, so it is parenthesised
// here to keep the division correct even when it is defined as an expression (e.g. 4*2).
#define BLUR_TILE_SIZE_Y_INV (1.0 / (FFX_BLUR_TILE_SIZE_Y))

//--------------------------------------------------------------------------------------
// GROUPSHARED MEMORY
//--------------------------------------------------------------------------------------
// Define CacheTypes
// BLUR_GROUPSHARED_MEMORY_TYPE is the storage qualifier + element type of the unpacked LDS caches.
//   SOA  -> one scalar per channel (rrrr...gggg...bbbb...)
//   !SOA -> one 3-component vector per pixel (rgbrgbrgb...)
// The HLSL path uses 'groupshared', every other path uses 'shared'.
#if BLUR_GROUPSHARED_MEMORY_SOA
#if BLUR_GROUPSHARED_MEMORY_HALF
#ifdef FFX_HLSL
#define BLUR_GROUPSHARED_MEMORY_TYPE groupshared FfxFloat16
#else
#define BLUR_GROUPSHARED_MEMORY_TYPE shared FfxFloat16
#endif
#else
#ifdef FFX_HLSL
#define BLUR_GROUPSHARED_MEMORY_TYPE groupshared FfxFloat32
#else
#define BLUR_GROUPSHARED_MEMORY_TYPE shared FfxFloat32
#endif
#endif
#else // BLUR_GROUPSHARED_MEMORY_SOA
#if BLUR_GROUPSHARED_MEMORY_HALF
#ifdef FFX_HLSL
#define BLUR_GROUPSHARED_MEMORY_TYPE groupshared FfxFloat16x3
#else
#define BLUR_GROUPSHARED_MEMORY_TYPE shared FfxFloat16x3
#endif
#else
#ifdef FFX_HLSL
#define BLUR_GROUPSHARED_MEMORY_TYPE groupshared FfxFloat32x3
#else
#define BLUR_GROUPSHARED_MEMORY_TYPE shared FfxFloat32x3
#endif
#endif
#endif // BLUR_GROUPSHARED_MEMORY_SOA

//==============================================================================================================================
//                                                     MATH HELPERS
//==============================================================================================================================
// Integer division of x by y, rounded up (for non-negative x and positive y).
#define DIV_AND_ROUND_UP(x, y) (((x) + ((y)-1)) / ((y)))

// Fast modulo operator for powers of two values for Y: x % y == x & (y-1)
// 'y' is parenthesised so that expression arguments (e.g. a macro expanding to 4*2) stay correct.
#define FAST_MOD(x, y) ((x) & ((y) - 1))
#if FFX_HALF
// 16-bit flavour of FAST_MOD: the '1' is an FfxInt16 so the subtraction stays in 16-bit.
#define FAST_MOD16(x, y) ((x) & ((y) - FfxInt16(1)))
#endif

// OUTPUT CACHE ########################################
/*
#   Notes from Jordan's Presentation
    src: https://gpuopen.com/gdc-presentations/2019/gdc-2019-s5-blend-of-gcn-optimization-and-color-processing.pdf

    Use 2^n tiles to use bitwise AND in place of the more ALU-expensive % operator, see #define FAST_MOD above
    MinTiles -> Ceil(HalfKernel / TileSize) * 2 + 1
*/
#if FFX_BLUR_OPTION_KERNEL_DIMENSION > 7
#define NUM_TILES_OUTPUT_CACHE 8
#else
#define NUM_TILES_OUTPUT_CACHE 4
#endif
// Total number of pixels held by the output ring buffer (all tiles).
// The client-overridable tile sizes are parenthesised individually.
#define NUM_PIXELS_OUTPUT_CACHE ((FFX_BLUR_TILE_SIZE_Y) * (FFX_BLUR_TILE_SIZE_X) * NUM_TILES_OUTPUT_CACHE)

#if BLUR_GROUPSHARED_MEMORY_PK_UINT
#ifdef FFX_HLSL
groupshared FfxUInt32  OutputCacheRG[NUM_PIXELS_OUTPUT_CACHE]; // RG: 2x fp16's are packed into 32bit unsigned int
groupshared FfxFloat32 OutputCacheB [NUM_PIXELS_OUTPUT_CACHE]; // B : don't use fp16 for B to avoid bank conflicts
#else
shared FfxUInt32  OutputCacheRG[NUM_PIXELS_OUTPUT_CACHE]; // RG: 2x fp16's are packed into 32bit unsigned int
shared FfxFloat32 OutputCacheB [NUM_PIXELS_OUTPUT_CACHE]; // B : don't use fp16 for B to avoid bank conflicts
#endif
#else
#if BLUR_GROUPSHARED_MEMORY_SOA
BLUR_GROUPSHARED_MEMORY_TYPE OutputCache[NUM_PIXELS_OUTPUT_CACHE * 3]; // stores rrrrrr...ggggggg...bbbbb...
#else
BLUR_GROUPSHARED_MEMORY_TYPE OutputCache[NUM_PIXELS_OUTPUT_CACHE];     // stores rgbrgbrgbrgbrgbrgbrgbrgb...
#endif // BLUR_GROUPSHARED_MEMORY_SOA
#endif // BLUR_GROUPSHARED_MEMORY_PK_UINT

// Based on the FFX_BLUR_OPTION_KERNEL_DIMENSION, we will need to pre-fill a number of tiles.
// e.g. TILE_SIZE_Y=8
// -----------------------------------------
// kernel =  3 | NUM_PREFILL_TILES_OUTPUT_CACHE = 1 |
// kernel =  5 | NUM_PREFILL_TILES_OUTPUT_CACHE = 1 |
// kernel =  7 | NUM_PREFILL_TILES_OUTPUT_CACHE = 1 |
// kernel =  9*| NUM_PREFILL_TILES_OUTPUT_CACHE = 2*|
// kernel = 11 | NUM_PREFILL_TILES_OUTPUT_CACHE = 2 |
// kernel = 13 | NUM_PREFILL_TILES_OUTPUT_CACHE = 2 |
// kernel = 15 | NUM_PREFILL_TILES_OUTPUT_CACHE = 2 |
// kernel = 17*| NUM_PREFILL_TILES_OUTPUT_CACHE = 3*|
// kernel = 19 | NUM_PREFILL_TILES_OUTPUT_CACHE = 3 |
// kernel = 21 | NUM_PREFILL_TILES_OUTPUT_CACHE = 3 |
// kernel = 23 | NUM_PREFILL_TILES_OUTPUT_CACHE = 3 |
// -----------------------------------------
#define NUM_PREFILL_TILES_OUTPUT_CACHE DIV_AND_ROUND_UP(FFX_BLUR_OPTION_KERNEL_DIMENSION, FFX_BLUR_TILE_SIZE_Y)

// INPUT CACHE ########################################
#if BLUR_ENABLE_INPUT_CACHE
// One input-cache row is a tile row widened by the kernel extent (KERNEL_DIMENSION - 1 extra pixels).
#define INPUT_CACHE_TILE_SIZE_X ((FFX_BLUR_TILE_SIZE_X) + (FFX_BLUR_OPTION_KERNEL_DIMENSION) - 1)
#define NUM_TILES_INPUT_CACHE   1
#define NUM_PIXELS_INPUT_CACHE  ((INPUT_CACHE_TILE_SIZE_X * (FFX_BLUR_TILE_SIZE_Y)) * NUM_TILES_INPUT_CACHE)
#if BLUR_GROUPSHARED_MEMORY_PK_UINT
#ifdef FFX_HLSL
groupshared FfxUInt32  InputCacheRG[NUM_PIXELS_INPUT_CACHE]; // RG: 2x fp16's are packed into 32bit unsigned int
groupshared FfxFloat32 InputCacheB [NUM_PIXELS_INPUT_CACHE]; // B : don't use fp16 for B to avoid bank conflicts
#else
shared FfxUInt32  InputCacheRG[NUM_PIXELS_INPUT_CACHE]; // RG: 2x fp16's are packed into 32bit unsigned int
shared FfxFloat32 InputCacheB [NUM_PIXELS_INPUT_CACHE]; // B : don't use fp16 for B to avoid bank conflicts
#endif
#else
#if BLUR_GROUPSHARED_MEMORY_SOA
BLUR_GROUPSHARED_MEMORY_TYPE InputCache[NUM_PIXELS_INPUT_CACHE * 3]; // stores rrrrrr...ggggggg...bbbbb...
#else
BLUR_GROUPSHARED_MEMORY_TYPE InputCache[NUM_PIXELS_INPUT_CACHE];     // stores rgbrgbrgbrgbrgbrgbrgbrgb...
#endif // BLUR_GROUPSHARED_MEMORY_SOA
#endif // BLUR_GROUPSHARED_MEMORY_PK_UINT
#endif // BLUR_ENABLE_INPUT_CACHE

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//==============================================================================================================================
//                                         GROUPSHARED MEMORY MAPPING FUNCTIONS
//==============================================================================================================================
// LDS ops ----------------------------------------------------------------------------------
// LDS TILES : FFX_BLUR_TILE_SIZE_X * FFX_BLUR_TILE_SIZE_Y sized tiles
// e.g.
//   FFX_BLUR_TILE_SIZE_X = 8
//
//  <------------ FFX_BLUR_TILE_SIZE_X -------------->
//  ^              OutputCache[0-7]
//  |              OutputCache[8-15]
//  |
//  |
//  TILE #1
//  |
//  |
//  v              OutputCache[56-63]
//  <------------------------------------------------>
//  |
//  <------------ FFX_BLUR_TILE_SIZE_X -------------->
//  ^              OutputCache[64-71]
//  |
//  |
//  |
//  TILE #2
//  |
//  |
//  v
//  <------------------------------------------------>
//  |
//...

// Packs two fp32 channels into one 32-bit word: ffxF32ToF16(c.r) goes to the upper 16 bits,
// ffxF32ToF16(c.g) to the lower 16 bits.
FfxUInt32 PackF2(FfxFloat32x2 c)
{
    return (ffxF32ToF16(c.r) << 16) | ffxF32ToF16(c.g);
}

// Inverse of PackF2: returns (upper 16 bits, lower 16 bits) converted back to fp32, i.e. (r, g).
FfxFloat32x2 UnpackToF2(FfxUInt32 packedRG)
{
#ifdef FFX_HLSL
    return f16tof32(FfxUInt32x2(packedRG >> 16, packedRG & 0xFFFF));
#else
    // unpackHalf2x16 yields (low half, high half); swizzle to get (r, g) in the same order as the HLSL path.
    return unpackHalf2x16(packedRG).yx;
#endif
}

#if FFX_HALF
// fp16 flavour of PackF2. Each channel is promoted to fp32 first and then converted with ffxF32ToF16.
// TODO: is there a cast fp16->FfxUInt32 and skip fp16->fp32 promotion?
FfxUInt32 PackH2(FfxFloat16x2 c)
{
    return (ffxF32ToF16(FfxFloat32(c.r)) << 16) | ffxF32ToF16(FfxFloat32(c.g));
}

// fp16 flavour of UnpackToF2: unpacks through fp32 and narrows the result to fp16.
FfxFloat16x2 UnpackToH2(FfxUInt32 packedRG)
{
    return FfxFloat16x2(UnpackToF2(packedRG));
}
#endif

// Maps a 2D index to a linear one: Index.x + Index.y * ElementStride (row-major, ElementStride = row length).
#ifdef FFX_HLSL
inline FfxUInt32 FlattenIndex(FfxInt32x2 Index, FfxInt32 ElementStride)
#else
FfxUInt32 FlattenIndex(FfxInt32x2 Index, FfxInt32 ElementStride)
#endif
{
    return Index.x + Index.y * ElementStride;
}

//------------------------------------------------------------------------------------------
// Cache accessors. 'index' is a 2D LDS coordinate; rows are FFX_BLUR_TILE_SIZE_X wide for the
// output cache and INPUT_CACHE_TILE_SIZE_X wide for the input cache.
//------------------------------------------------------------------------------------------
#if BLUR_GROUPSHARED_MEMORY_SOA

// SoA layout: the three channels live in three consecutive planes of NUM_PIXELS_OUTPUT_CACHE elements.
void SetOutputCache(FfxInt32x2 index, FfxFloat32x3 value)
{
    // Row stride is FFX_BLUR_TILE_SIZE_X, matching GetOutputCache below
    // (the previous 'BLUR_TILE_SIZE_X' is not defined anywhere in this file).
    FfxInt32 iLDS = index.x + index.y * FFX_BLUR_TILE_SIZE_X;
    OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 0] = value.r;
    OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 1] = value.g;
    OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 2] = value.b;
}
FfxFloat32x3 GetOutputCache(FfxInt32x2 index)
{
    FfxInt32 iLDS = index.x + index.y * FFX_BLUR_TILE_SIZE_X;
    FfxFloat32x3 c;
    c.r = OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 0];
    c.g = OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 1];
    c.b = OutputCache[iLDS + NUM_PIXELS_OUTPUT_CACHE * 2];
    return c;
}
#if BLUR_ENABLE_INPUT_CACHE
void SetInputCache(FfxInt32x2 index, FfxFloat32x3 value)
{
    FfxInt32 iLDS = index.x + index.y * INPUT_CACHE_TILE_SIZE_X;
    InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 0] = value.r;
    InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 1] = value.g;
    InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 2] = value.b;
}
FfxFloat32x3 GetInputCache(FfxInt32x2 index)
{
    FfxInt32 iLDS = index.x + index.y * INPUT_CACHE_TILE_SIZE_X;
    FfxFloat32x3 c;
    c.r = InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 0];
    c.g = InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 1];
    c.b = InputCache[iLDS + NUM_PIXELS_INPUT_CACHE * 2];
    return c;
}
#endif // BLUR_ENABLE_INPUT_CACHE

#else // BLUR_GROUPSHARED_MEMORY_SOA

#if BLUR_GROUPSHARED_MEMORY_PK_UINT
// Packed layout: R and G share one 32-bit word in OutputCacheRG, B is kept as fp32 in OutputCacheB.
#if FFX_HALF
void SetOutputCache(FfxInt32x2 index, FfxFloat16x3 value)
{
    FfxInt32 iLDS = index.x + index.y * FFX_BLUR_TILE_SIZE_X;
    OutputCacheRG[iLDS] = PackH2(value.rg);
    OutputCacheB[iLDS] = value.b;
}
FfxFloat16x3 GetOutputCache(FfxInt32x2 index)
{
    FfxInt32 iLDS = index.x + index.y * FFX_BLUR_TILE_SIZE_X;
    FfxFloat16x2 RG = UnpackToH2(OutputCacheRG[iLDS]);
    return FfxFloat16x3(RG.r, RG.g, OutputCacheB[iLDS]);
}
#else
void SetOutputCache(FfxInt32x2 index, FfxFloat32x3 value)
{
    FfxInt32 iLDS = index.x + index.y * FFX_BLUR_TILE_SIZE_X;
    OutputCacheRG[iLDS] = PackF2(value.rg);
    OutputCacheB[iLDS] = value.b;
}
FfxFloat32x3 GetOutputCache(FfxInt32x2 index)
{
    FfxInt32 iLDS = index.x + index.y * FFX_BLUR_TILE_SIZE_X;
    FfxFloat32x2 RG = UnpackToF2(OutputCacheRG[iLDS]);
    return FfxFloat32x3(RG.r, RG.g, OutputCacheB[iLDS]);
}
#endif // FFX_HALF
#else // BLUR_GROUPSHARED_MEMORY_PK_UINT
// Unpacked AoS layout: one 3-component element per pixel.
#if FFX_HALF
void SetOutputCache(FfxInt32x2 index, FfxFloat16x3 value)
{
    const FfxUInt32 iLDS = FlattenIndex(index, FFX_BLUR_TILE_SIZE_X);
    OutputCache[iLDS] = value;
}
FfxFloat16x3 GetOutputCache(FfxInt32x2 index)
{
    const FfxUInt32 iLDS = FlattenIndex(index, FFX_BLUR_TILE_SIZE_X);
    return OutputCache[iLDS];
}
#else
void SetOutputCache(FfxInt32x2 index, FfxFloat32x3 value)
{
    const FfxUInt32 iLDS = FlattenIndex(index, FFX_BLUR_TILE_SIZE_X);
    OutputCache[iLDS] = value;
}
FfxFloat32x3 GetOutputCache(FfxInt32x2 index)
{
    const FfxUInt32 iLDS = FlattenIndex(index, FFX_BLUR_TILE_SIZE_X);
    return OutputCache[iLDS];
}
#endif // FFX_HALF
#endif // BLUR_GROUPSHARED_MEMORY_PK_UINT

#if BLUR_ENABLE_INPUT_CACHE
#if BLUR_GROUPSHARED_MEMORY_PK_UINT
#if FFX_HALF
void SetInputCache(FfxInt32x2 index, FfxFloat16x3 value)
{
    FfxInt32 iLDS = FlattenIndex(index, INPUT_CACHE_TILE_SIZE_X);
    InputCacheRG[iLDS] = PackH2(value.rg);
    InputCacheB[iLDS] = value.b;
}
FfxFloat16x3 GetInputCache(FfxInt32x2 index)
{
    FfxInt32 iLDS = FlattenIndex(index, INPUT_CACHE_TILE_SIZE_X);
    FfxFloat16x2 RG = UnpackToH2(InputCacheRG[iLDS]);
    return FfxFloat16x3(RG.r, RG.g, InputCacheB[iLDS]);
}
#else
void SetInputCache(FfxInt32x2 index, FfxFloat32x3 value)
{
    //FfxInt32 iLDS = FlattenIndex(index, INPUT_CACHE_TILE_SIZE_X);
    FfxInt32 iLDS = index.x + index.y * INPUT_CACHE_TILE_SIZE_X;
    InputCacheRG[iLDS] = PackF2(value.rg);
    InputCacheB[iLDS] = value.b;
}
FfxFloat32x3 GetInputCache(FfxInt32x2 index)
{
    //FfxInt32 iLDS = FlattenIndex(index, INPUT_CACHE_TILE_SIZE_X);
    FfxInt32 iLDS = index.x + index.y * INPUT_CACHE_TILE_SIZE_X;
    FfxFloat32x2 RG = UnpackToF2(InputCacheRG[iLDS]);
    return FfxFloat32x3(RG.r, RG.g, InputCacheB[iLDS]);
}
#endif // FFX_HALF
#else // BLUR_GROUPSHARED_MEMORY_PK_UINT
void SetInputCache(FfxInt32x2 index, FfxFloat32x3 value)
{
    InputCache[index.x + index.y * INPUT_CACHE_TILE_SIZE_X].rgb = value;
}
FfxFloat32x3 GetInputCache(FfxInt32x2 index)
{
    return InputCache[index.x + index.y * INPUT_CACHE_TILE_SIZE_X].rgb;
}
#endif // BLUR_GROUPSHARED_MEMORY_PK_UINT
#endif // BLUR_ENABLE_INPUT_CACHE
#endif // BLUR_GROUPSHARED_MEMORY_SOA

// Thin wrapper around the FFX_GROUP_MEMORY_BARRIER macro (defined outside this file).
void LDSBarrier()
{
    FFX_GROUP_MEMORY_BARRIER;
}

// index of the LDS tile in the ring buffer
#if FFX_HALF
#if BLUR_FP16_KERNEL_LOOPS
// Wraps a tile index into [0, NUM_TILES_OUTPUT_CACHE) using the power-of-two FAST_MOD16.
FfxInt16 GetOutputCacheTile(FfxInt16 iTile)
{
    return FAST_MOD16(iTile, FfxInt16(NUM_TILES_OUTPUT_CACHE));
}

// Stores 'color' for local thread 'threadID' into ring-buffer tile 'iTile' of the output cache.
void CacheInOutputTile(FfxInt32x2 threadID, FfxInt16 iTile, FfxFloat16x3 color)
{
    FfxInt16 iLDSTile = GetOutputCacheTile(iTile);                                    // map image tile to LDS ring buffered tiles
    FfxInt16x2 TileOffset = FfxInt16x2(0, FfxInt16(FFX_BLUR_TILE_SIZE_Y) * iLDSTile); // pixel offset for this tile
    FfxInt16x2 iLDS = FfxInt16x2(threadID) + TileOffset;                              // 2D LDS coord based on local thread ID
    SetOutputCache(iLDS, color);
}
// Loads the value stored for local thread 'threadID' from ring-buffer tile 'iTile' of the output cache.
FfxFloat16x3 LoadFromCachedOutputTile(FfxInt32x2 threadID, FfxInt16 iTile)
{
    FfxInt16 iLDSTile = GetOutputCacheTile(iTile);                                    // map image tile to LDS ring buffered tiles
    FfxInt16x2 TileOffset = FfxInt16x2(0, FfxInt16(FFX_BLUR_TILE_SIZE_Y) * iLDSTile); // pixel offset for this tile
    FfxInt16x2 iLDS = FfxInt16x2(threadID) + TileOffset;                              // 2D LDS coord based on local thread ID
    return GetOutputCache(iLDS);
}
#else // BLUR_FP16_KERNEL_LOOPS
// Wraps a tile index into [0, NUM_TILES_OUTPUT_CACHE) using the power-of-two FAST_MOD.
FfxInt32 GetOutputCacheTile(FfxInt32 iTile)
{
    return FAST_MOD(iTile, NUM_TILES_OUTPUT_CACHE);
}
void CacheInOutputTile(FfxInt32x2 threadID, FfxInt32 iTile, FfxFloat16x3 color)
{
    FfxInt32 iLDSTile = GetOutputCacheTile(iTile);
    FfxInt32x2 TileOffset = FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iLDSTile);
    FfxInt32x2 iLDS = threadID + TileOffset;
    SetOutputCache(iLDS, color);
}
// Returns fp16 like every other FFX_HALF accessor in this file: GetOutputCache() yields FfxFloat16x3
// and the callers store the result in an FfxFloat16x3 (previously declared as FfxFloat32x3).
FfxFloat16x3 LoadFromCachedOutputTile(FfxInt32x2 threadID, FfxInt32 iTile)
{
    FfxInt32 iLDSTile = GetOutputCacheTile(iTile);
    FfxInt32x2 TileOffset = FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iLDSTile);
    FfxInt32x2 iLDS = threadID + TileOffset;
    return GetOutputCache(iLDS);
}
#endif // BLUR_FP16_KERNEL_LOOPS
#else // FFX_HALF
FfxInt32 GetOutputCacheTile(FfxInt32 iTile)
{
    return FAST_MOD(iTile, NUM_TILES_OUTPUT_CACHE);
}
void CacheInOutputTile(FfxInt32x2 threadID, FfxInt32 iTile, FfxFloat32x3 color)
{
    FfxInt32 iLDSTile = GetOutputCacheTile(iTile);                          // map image tile to LDS ring buffered tiles
    FfxInt32x2 TileOffset = FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iLDSTile); // pixel offset for this tile
    FfxInt32x2 iLDS = threadID + TileOffset;                                // 2D LDS coord based on local thread ID
    SetOutputCache(iLDS, color);
}
FfxFloat32x3 LoadFromCachedOutputTile(FfxInt32x2 threadID, FfxInt32 iTile)
{
    FfxInt32 iLDSTile = GetOutputCacheTile(iTile);                          // map image tile to LDS ring buffered tiles
    FfxInt32x2 TileOffset = FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iLDSTile); // pixel offset for this tile
    FfxInt32x2 iLDS = threadID + TileOffset;                                // 2D LDS coord based on local thread ID
    return GetOutputCache(iLDS);
}
#endif // FFX_HALF

#if BLUR_ENABLE_INPUT_CACHE
// Writes 'c' into the input cache at column (threadID.x + KernelOffset + FFX_BLUR_KERNEL_RANGE - 1), row threadID.y.
// The (FFX_BLUR_KERNEL_RANGE - 1) bias shifts the left-most kernel tap (KernelOffset = -(RANGE-1)) to column threadID.x.
#if FFX_HALF
void CacheInInputTile(FfxInt32x2 threadID, FfxInt32 KernelOffset, FfxFloat16x3 c)
#else
void CacheInInputTile(FfxInt32x2 threadID, FfxInt32 KernelOffset, FfxFloat32x3 c)
#endif
{
    FfxInt32x2 InputCacheCoord = threadID + FfxInt32x2(KernelOffset, 0) + FfxInt32x2(FFX_BLUR_KERNEL_RANGE-1, 0);
    SetInputCache(InputCacheCoord, c);
}

// Reads the input cache using the same coordinate mapping as CacheInInputTile.
#if FFX_HALF
FfxFloat16x3 LoadFromCachedInputTile(FfxInt32x2 threadID, FfxInt32 KernelOffset)
#else
FfxFloat32x3 LoadFromCachedInputTile(FfxInt32x2 threadID, FfxInt32 KernelOffset)
#endif
{
    FfxInt32x2 InputCacheCoord = threadID + FfxInt32x2(KernelOffset,0) + FfxInt32x2(FFX_BLUR_KERNEL_RANGE-1, 0);
    return GetInputCache(InputCacheCoord);
}
#endif // BLUR_ENABLE_INPUT_CACHE

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//==============================================================================================================================
//                                                   BLUR FUNCTIONS
//==============================================================================================================================

// Horizontal 1D convolution around CenterPixelLocation, sampling the input through BlurLoadInput()
// and weighting tap +/-i with BlurLoadKernelWeight(i). Unless BLUR_DISABLE_CLAMP is set, the
// sample x coordinate is clamped to [0, ImageSize.x - 1]; y is used as-is.
#if FFX_HALF
FfxFloat16x3 HorizontalBlurFromTexture(FfxInt32x2 CenterPixelLocation, FfxInt32x2 ImageSize)
{
#if BLUR_FP16_CLAMP // this uses 4 less VGPRs but not faster
    const FfxInt16 ImageSizeClampValueX = FfxInt16(ImageSize.x - 1);

    // center tap first, then the symmetric +i / -i pairs
    FfxFloat16x3 BlurredImage = BlurLoadInput(FfxInt16x2(CenterPixelLocation)) * BlurLoadKernelWeight(0);
    for (FfxInt32 i = 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
    {
        // [0] = right tap (x + i), [1] = left tap (x - i); both are clamped with a single 2-wide fp16 clamp
        FfxInt16x2 SampleCoordXX = FfxInt16x2(CenterPixelLocation.x + i, CenterPixelLocation.x - i);
#if !BLUR_DISABLE_CLAMP
        SampleCoordXX = clamp(SampleCoordXX, FfxInt16x2(0, 0), FfxInt16x2(ImageSizeClampValueX, ImageSizeClampValueX));
#endif
        BlurredImage += BlurLoadInput(FfxInt16x2(SampleCoordXX[0], CenterPixelLocation.y)) * BlurLoadKernelWeight(i);
        BlurredImage += BlurLoadInput(FfxInt16x2(SampleCoordXX[1], CenterPixelLocation.y)) * BlurLoadKernelWeight(i);
    }
#else
    FfxFloat16x3 BlurredImage = FfxFloat16x3(0.f, 0.f, 0.f);
    for (FfxInt32 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
    {
        FfxInt32x2 SampleCoord = CenterPixelLocation + FfxInt32x2(i, 0); // horizontal blur
#if !BLUR_DISABLE_CLAMP
        SampleCoord.x = clamp(SampleCoord.x, 0, ImageSize.x-1); // clamp
#endif
        FfxFloat16x3 c = BlurLoadInput(SampleCoord);
        BlurredImage += c * BlurLoadKernelWeight(abs(i));
    }
#endif // BLUR_FP16_CLAMP
    return BlurredImage;
}
#else // FFX_HALF
FfxFloat32x3 HorizontalBlurFromTexture(FfxInt32x2 CenterPixelLocation, FfxInt32x2 ImageSize)
{
    FfxFloat32x3 BlurredImage = FfxFloat32x3(0.f, 0.f, 0.f);
    for (FfxInt32 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
    {
        FfxInt32x2 SampleCoord = CenterPixelLocation + FfxInt32x2(i, 0); // horizontal blur
#if !BLUR_DISABLE_CLAMP
        SampleCoord.x = clamp(SampleCoord.x, 0, ImageSize.x-1); // clamp
#endif
        FfxFloat32x3 c = BlurLoadInput(SampleCoord);
        BlurredImage += c * BlurLoadKernelWeight(abs(i));
    }
    return BlurredImage;
}
#endif // FFX_HALF

#if BLUR_ENABLE_INPUT_CACHE
// Horizontal 1D convolution that reads its taps from the input cache (see FillInputCache)
// instead of sampling the input through BlurLoadInput().
#if FFX_HALF
FfxFloat16x3 HorizontalBlurFromCachedInput(FfxInt32x2 threadID)
#else
FfxFloat32x3 HorizontalBlurFromCachedInput(FfxInt32x2 threadID)
#endif
{
#if FFX_HALF
    FfxFloat16x3 BlurredImage = FfxFloat16x3(0.f, 0.f, 0.f);
#else
    FfxFloat32x3 BlurredImage = FfxFloat32x3(0.f, 0.f, 0.f);
#endif // FFX_HALF
    for (FfxInt32 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
    {
        BlurredImage += LoadFromCachedInputTile(threadID, i) * BlurLoadKernelWeight(abs(i));
    }
    return BlurredImage;
}
#endif // BLUR_ENABLE_INPUT_CACHE

// Vertical 1D convolution around CenterPixelLocation, reading the horizontally blurred values
// from the output cache ring buffer.
//   ThreadID            : local thread id; only .x is used (cache column)
//   WorkGroupID         : only .y is used, to convert image-space tile indices to workgroup-local ones
//   CenterPixelLocation : image-space pixel being produced
//   ImageSize           : image dimensions, used for the border clamp and the per-workgroup tile count
#if FFX_HALF
#if BLUR_FP16_KERNEL_LOOPS
FfxFloat16x3 VerticalBlurFromCachedOutput(FfxInt32x2 ThreadID, FfxInt32x2 WorkGroupID,FfxInt16x2 CenterPixelLocation, FfxInt16x2 ImageSize)
{
    const FfxInt16x2 ImageSizeClampValueXY = ImageSize.xy - FfxInt16x2(1, 1);
    const FfxUInt32 iTileCount = DIV_AND_ROUND_UP(FfxUInt32(ImageSize.y), FFX_BLUR_TILE_SIZE_Y * FFX_BLUR_DISPATCH_Y);
    FfxFloat16x3 value = FfxFloat16x3(0, 0, 0);
#ifndef FFX_HLSL
    // For some reason using 16 bit integer for this loop in glsl does not work. It seems to be due to the use of
    // a negative value as a starting value that is compared to a positive value, which seems to incorrectly cause
    // the condition to always be false.
    for (FfxInt32 i = (FfxInt32(-FFX_BLUR_KERNEL_RANGE) + FfxInt32(1)); i < FfxInt32(FFX_BLUR_KERNEL_RANGE); ++i)
#else
    for (FfxInt16 i = (FfxInt16(-FFX_BLUR_KERNEL_RANGE) + FfxInt16(1)); i < FfxInt16(FFX_BLUR_KERNEL_RANGE); ++i)
#endif
    {
        FfxInt16x2 KernelSampleLocation = CenterPixelLocation + FfxInt16x2(0, i);
#if !BLUR_DISABLE_CLAMP
        KernelSampleLocation.xy = clamp(KernelSampleLocation.xy, FfxInt16x2(0, 0), ImageSizeClampValueXY);
#endif
        // image-space tile row -> workgroup-local tile -> row inside that tile
        const FfxInt16 iTile_ImageSpace = FfxInt16(KernelSampleLocation.y * BLUR_TILE_SIZE_Y_INV);
        const FfxInt16 iTile = FfxInt16(iTile_ImageSpace - iTileCount * WorkGroupID.y);
        FfxInt16x2 TileThreadID = FfxInt16x2(ThreadID.x, FAST_MOD16(KernelSampleLocation.y, FfxInt16(FFX_BLUR_TILE_SIZE_Y)));
        FfxFloat16x3 c = LoadFromCachedOutputTile(TileThreadID, iTile);
        value += c * BlurLoadKernelWeight(abs(i));
    }
    return value;
}
#else // BLUR_FP16_KERNEL_LOOPS
FfxFloat16x3 VerticalBlurFromCachedOutput(FfxInt32x2 ThreadID, FfxInt32x2 WorkGroupID, FfxInt32x2 CenterPixelLocation, FfxInt32x2 ImageSize)
{
    const FfxUInt32 iTileCount = DIV_AND_ROUND_UP(ImageSize.y, FFX_BLUR_TILE_SIZE_Y * FFX_BLUR_DISPATCH_Y);
    FfxFloat16x3 value = FfxFloat16x3(0, 0, 0);
#if BLUR_FP16_CLAMP
    const FfxInt16x2 ClampUpperLimitXY = ImageSize.xy - 1;
    for (FfxInt16 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
    {
        FfxInt16x2 KernelSampleLocation = CenterPixelLocation + FfxInt16x2(0, i);
#if !BLUR_DISABLE_CLAMP
#if BLUR_OPTIMIZED_CLAMP
        // NOTE(review): experimental path. It tests the sign via .x but rewrites .y, and never clamps
        // the upper bound; confirm intent before enabling BLUR_OPTIMIZED_CLAMP.
        bool bNegative = firstbithigh(KernelSampleLocation.x) == 31;
        KernelSampleLocation.y = bNegative ? 0 : KernelSampleLocation.y;
#else
        KernelSampleLocation.xy = clamp(KernelSampleLocation.xy, FfxInt16x2(0, 0), ClampUpperLimitXY);
#endif // BLUR_OPTIMIZED_CLAMP
#endif
        // BLUR_TILE_SIZE_Y_INV is the reciprocal defined in this file ('FFX_BLUR_TILE_SIZE_Y_INV' does not exist).
        const FfxInt16 iTile_ImageSpace = FfxInt16(KernelSampleLocation.y * BLUR_TILE_SIZE_Y_INV);
        const FfxInt16 iTile = iTile_ImageSpace - iTileCount * WorkGroupID.y;
        FfxInt16x2 TileThreadID = FfxInt16x2(ThreadID.x, FAST_MOD16(KernelSampleLocation.y, FfxInt16(FFX_BLUR_TILE_SIZE_Y)));
        FfxFloat16x3 c = LoadFromCachedOutputTile(TileThreadID, iTile);
        value += c * BlurLoadKernelWeight(abs(i));
    }
#else
    for (FfxInt32 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
    {
        FfxInt32x2 KernelSampleLocation = CenterPixelLocation + FfxInt32x2(0, i);
#if !BLUR_DISABLE_CLAMP
#if BLUR_OPTIMIZED_CLAMP
        // NOTE(review): experimental path, see the remark in the BLUR_FP16_CLAMP branch above.
        bool bNegative = firstbithigh(KernelSampleLocation.x) == 31;
        KernelSampleLocation.y = bNegative ? 0 : KernelSampleLocation.y;
#else
        KernelSampleLocation.xy = clamp(KernelSampleLocation.xy, 0, ImageSize.xy-1);
#endif // BLUR_OPTIMIZED_CLAMP
#endif
        const FfxInt32 iTile_ImageSpace = FfxInt32(KernelSampleLocation.y * BLUR_TILE_SIZE_Y_INV);
        // uses this function's own 'iTileCount' (there is no 'iTileCountPerWorkgroup' in this variant)
        const FfxInt32 iTile = iTile_ImageSpace - (iTileCount * WorkGroupID.y);
        FfxInt32x2 TileThreadID = FfxInt32x2(ThreadID.x, FAST_MOD(KernelSampleLocation.y, FFX_BLUR_TILE_SIZE_Y));
        FfxFloat16x3 c = LoadFromCachedOutputTile(TileThreadID, iTile);
        value += c * BlurLoadKernelWeight(abs(i));
    }
#endif // BLUR_FP16_CLAMP
    return value;
}
#endif // BLUR_FP16_KERNEL_LOOPS
#else // FFX_HALF
FfxFloat32x3 VerticalBlurFromCachedOutput(FfxInt32x2 ThreadID, FfxInt32x2 WorkGroupID,FfxInt32x2 CenterPixelLocation, FfxInt32x2 ImageSize)
{
    const FfxInt32 iTileCountPerWorkgroup = DIV_AND_ROUND_UP(ImageSize.y, FFX_BLUR_TILE_SIZE_Y * FFX_BLUR_DISPATCH_Y);
    FfxFloat32x3 value = FfxFloat32x3(0,0,0);
    for (FfxInt32 i = -FFX_BLUR_KERNEL_RANGE + 1; i < FFX_BLUR_KERNEL_RANGE; ++i)
    {
        FfxInt32x2 KernelSampleLocation = CenterPixelLocation + FfxInt32x2(0, i);
#if !BLUR_DISABLE_CLAMP
        KernelSampleLocation.xy = clamp(KernelSampleLocation.xy, FfxInt32x2(0, 0), ImageSize.xy-1);
#endif
        // which 'global' tile in the image space
        const FfxInt32 iTile_ImageSpace = FfxInt32(KernelSampleLocation.y * BLUR_TILE_SIZE_Y_INV);

        // local tile in this workgroup - apply the offset to convert to local space tile coordinates
        // this is needed for workgroups that have WorkgroupID.y > 0: the previous workgroup's
        // tile mapping doesn't have to align with the current one's depending on the FFX_BLUR_TILE_SIZE_XY.
        // e.g. WorkGroupID=1's first tile will map to 0 in local space, but could be some non-0 index
        //      in the local space of the previous workgroup (WorkGroupID=0).
        // Not correcting for this mapping will result in a chopped image on the workgroup borders.
        const FfxInt32 iTile = iTile_ImageSpace - (iTileCountPerWorkgroup * WorkGroupID.y);

        // FAST_MOD == KernelSampleLocation.y % FFX_BLUR_TILE_SIZE_Y for power-of-two tile heights
        FfxInt32x2 TileThreadID = FfxInt32x2(ThreadID.x, FAST_MOD(KernelSampleLocation.y, FFX_BLUR_TILE_SIZE_Y));
        value += LoadFromCachedOutputTile(TileThreadID, iTile) * BlurLoadKernelWeight(abs(i));
    }
    return value;
}
#endif // FFX_HALF

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//==============================================================================================================================
//                                               INPUT/OUTPUT CACHE HELPERS
//==============================================================================================================================
//
#if BLUR_ENABLE_INPUT_CACHE
// Fills the input cache with the corresponding region from the image + kernel extents
//   lxy                 : local thread id inside the workgroup
//   CenterPixelLocation : image-space pixel this thread is centred on
//   ImageSize           : image dimensions, used to clamp the sample x coordinate
// Ends with an LDSBarrier() so the cache can be read right after the call.
void FillInputCache(in FfxInt32x2 lxy, FfxInt32x2 CenterPixelLocation, FfxInt32x2 ImageSize)
{
#if FFX_HALF
    FfxFloat16x3 c = FfxFloat16x3(0, 0, 0);
#else
    FfxFloat32x3 c = FfxFloat32x3(0, 0, 0);
#endif
    // slide the thread group over the InputCache
    const FfxInt32 iNumLoops = DIV_AND_ROUND_UP(INPUT_CACHE_TILE_SIZE_X, FFX_BLUR_TILE_SIZE_X);
    for (FfxInt32 i = 0; i < iNumLoops; ++i)
    {
        FfxInt32x2 LDSCoord = lxy - FfxInt32x2(FFX_BLUR_KERNEL_RANGE - 1, 0) + FfxInt32x2(i * FFX_BLUR_TILE_SIZE_X, 0);
        FfxInt32x2 SamplePosition = CenterPixelLocation - FfxInt32x2(FFX_BLUR_KERNEL_RANGE - 1, 0) + FfxInt32x2(i * FFX_BLUR_TILE_SIZE_X, 0);
#if !BLUR_DISABLE_CLAMP
        SamplePosition.x = clamp(SamplePosition.x, 0, ImageSize.x - 1);
#endif
        c = BlurLoadInput(SamplePosition);

        // clamp to LDS bounds if we're on the last iteration of the loop
        if (i == iNumLoops - 1)
        {
            // faster than 'if (LDSCoord.x < INPUT_CACHE_TILE_SIZE_X)', avoids a vmem sync at the cost of some ALU
            // Scalar bounds: LDSCoord.x is a scalar, so it is clamped against the x extents only.
            // CacheInInputTile() adds (FFX_BLUR_KERNEL_RANGE - 1) back, giving a final column in
            // [0, INPUT_CACHE_TILE_SIZE_X - 1].
            LDSCoord.x = clamp(LDSCoord.x,
                               -(FFX_BLUR_KERNEL_RANGE - 1),
                               (INPUT_CACHE_TILE_SIZE_X - 1) - (FFX_BLUR_KERNEL_RANGE - 1));
        }
        CacheInInputTile(LDSCoord, 0, c);
    }
    LDSBarrier();
}
#endif // BLUR_ENABLE_INPUT_CACHE

// Fills the output cache with the horizontally-blurred image.
//   gxy         : image-space pixel of this thread's first tile
//   lxy         : local thread id inside the workgroup
//   WorkGroupID : only .y is used, to detect workgroups that do not start at the top of the image
//   ImageSize   : forwarded to the horizontal blur for border clamping
// Ends with an LDSBarrier().
void PreFillOutputCache(in FfxInt32x2 gxy, in FfxInt32x2 lxy, in FfxInt32x2 WorkGroupID, FfxInt32x2 ImageSize)
{
#if BLUR_DEBUG_PREFILL_OUTPUT_CACHE_WITH_COLOR
    // Debug aid: overwrite every tile of the ring buffer with a constant colour first.
    // The colour and index types follow FFX_HALF, like the loops below, so the call matches
    // the CacheInOutputTile() overload that is compiled in.
#if FFX_HALF
    FfxFloat16x3 FillColor = FfxFloat16x3(0, 0, 0); // black border color
#else
    FfxFloat32x3 FillColor = FfxFloat32x3(0, 0, 0); // black border color
#endif
#ifdef FFX_HLSL
    [unroll] // HLSL-only attribute syntax
#endif
#if FFX_HALF
    for (FfxInt16 iTile = FfxInt16(0); iTile < FfxInt16(NUM_TILES_OUTPUT_CACHE); ++iTile)
#else
    for (FfxInt32 iTile = 0; iTile < NUM_TILES_OUTPUT_CACHE; ++iTile)
#endif
    {
        CacheInOutputTile(lxy, iTile, FillColor);
    }
    LDSBarrier();
#endif

    // load from VMEM the first NUM_PREFILL_TILES_OUTPUT_CACHE tiles
    // while doing the horizontal blur, going top down
#if FFX_HALF
    for (FfxInt16 j = FfxInt16(0); j < FfxInt16(NUM_PREFILL_TILES_OUTPUT_CACHE); ++j)
#else
    for (FfxInt32 j = 0; j < NUM_PREFILL_TILES_OUTPUT_CACHE; ++j)
#endif
    {
        const FfxInt32x2 ImageCoordinate = gxy + FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * j);
#if FFX_HALF
        FfxFloat16x3 c = HorizontalBlurFromTexture(ImageCoordinate, ImageSize);
#else
        FfxFloat32x3 c = HorizontalBlurFromTexture(ImageCoordinate, ImageSize);
#endif
        CacheInOutputTile(lxy, j, c);
    }

#if FFX_BLUR_DISPATCH_Y != 1
    // for any workgroup that doesn't start from the top of the image,
    // fill the cache from the tail, going upwards in the image space
    if (WorkGroupID.y != 0)
    {
#if FFX_HALF
        FfxFloat16x3 c = FfxFloat16x3(0, 0, 0);
#else
        FfxFloat32x3 c = FfxFloat32x3(0, 0, 0);
#endif

        // Tile j above this workgroup's first tile is stored in ring-buffer slot (NUM_TILES_OUTPUT_CACHE - j).
#if FFX_HALF
        for (FfxInt16 j = FfxInt16(1); j < FfxInt16(NUM_PREFILL_TILES_OUTPUT_CACHE+1 + (DIV_AND_ROUND_UP(FFX_BLUR_KERNEL_RANGE, FFX_BLUR_TILE_SIZE_Y))); ++j)
#else
        for (FfxInt32 j = 1; j < NUM_PREFILL_TILES_OUTPUT_CACHE+1 + (DIV_AND_ROUND_UP(FFX_BLUR_KERNEL_RANGE, FFX_BLUR_TILE_SIZE_Y)); ++j)
#endif
        {
            const FfxInt32x2 ImageCoordinate = gxy - FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * j);
#if BLUR_ENABLE_INPUT_CACHE
            FillInputCache(lxy, ImageCoordinate, ImageSize);
            c = HorizontalBlurFromCachedInput(lxy);
#else
            c = HorizontalBlurFromTexture(ImageCoordinate, ImageSize);
#endif
#if FFX_HALF
            CacheInOutputTile(lxy, (FfxInt16(NUM_TILES_OUTPUT_CACHE) - j), c);
#else
            CacheInOutputTile(lxy, (NUM_TILES_OUTPUT_CACHE - j), c);
#endif
        }
    }
#endif
    LDSBarrier(); // OutputCache Sync: Read -> Write =========================================
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//==============================================================================================================================
//                                             BLUR GAUSSIAN BLUR ALGORITHM
//==============================================================================================================================

/// ffxBlur: The main idea of the algorithm is to utilize a number of tiles (8x8) that are cached on the groupshared memory
/// in a ring-buffer fashion to speed up texture lookups in a hand-optimized compute shader.
/// The tiles are defined by the FFX_BLUR_TILE_SIZE_X and FFX_BLUR_TILE_SIZE_Y defines, and are typically 8x8 pixels.
/// The image is horizontally blurred while being cached on the groupshared memory,
/// and when all the groupshared tiles are filled, a vertical blur pass is done on the groupshared memory
/// and the result is stored in the UAV as the final destination.
///
/// The algorithm is as follows:
/// - Pre-fill LDS with 8x8 tiles, storing vertical tiles, containing horizontally blurred color
/// - Loop until the entire image is covered:
///   - Run a vertical blur pass on the LDS and output to final destination UAV
///   - Re-fill LDS with horizontally-blurred data
/// - Finish off the remaining last row/section of the image
///
/// @param [in] GlobalThreadID           The SV_DispatchThreadID.xy or gl_GlobalInvocationID.xy.
/// @param [in] WorkGroupLocalThreadID   The SV_GroupThreadID.xy or gl_LocalInvocationID.xy.
/// @param [in] WorkGroupID              The SV_GroupID.xy or gl_WorkGroupID.xy.
/// @param [in] ImageSize                The two dimensional size of the input and output image.
///
/// @note The per-workgroup tile count and every tile index are kept signed. A workgroup that owns fewer
///       tiles than NUM_PREFILL_TILES_OUTPUT_CACHE therefore skips the streaming loop (STEP #2) and is
///       handled entirely by the tail loop (STEP #3), instead of the loop bound wrapping around as an
///       unsigned value.
///
/// @ingroup FfxGPUBlur
void ffxBlur(
    in FfxInt32x2 GlobalThreadID,
    in FfxInt32x2 WorkGroupLocalThreadID,
    in FfxInt32x2 WorkGroupID,
    FfxInt32x2 ImageSize)
{
    // Note: GlobalThreadID is not read here; the image coordinate (gxy) is rebuilt below from
    // WorkGroupID and WorkGroupLocalThreadID.

    // Each threadgroup processes a number of tiles of size FFX_BLUR_TILE_SIZE_Y * FFX_BLUR_TILE_SIZE_X.
    // This number depends on the image height, the vertical dimension (_Y) of the FFX_BLUR_TILE_SIZE
    // and on FFX_BLUR_DISPATCH_Y.
    //
    // Signed on purpose: with an unsigned tile count, 'iTileCount - NUM_PREFILL_TILES_OUTPUT_CACHE'
    // wraps around to a huge value whenever the workgroup has fewer tiles than the pre-fill count,
    // and the STEP #2 loop would then run for ~2^32 iterations.
    const FfxInt32 iTileCount = FfxInt32(DIV_AND_ROUND_UP(ImageSize.y, FFX_BLUR_TILE_SIZE_Y * FFX_BLUR_DISPATCH_Y));

    // Number of tiles that STEP #2 writes out. Zero or negative when the workgroup has no more tiles
    // than NUM_PREFILL_TILES_OUTPUT_CACHE, in which case STEP #2 is skipped entirely.
    const FfxInt32 iStreamedTileCount = iTileCount - FfxInt32(NUM_PREFILL_TILES_OUTPUT_CACHE);

    // Image coordinate of this thread inside the first tile owned by its workgroup.
    FfxInt32x2 gxy = FfxInt32x2(
        WorkGroupID.x * FFX_BLUR_TILE_SIZE_X + WorkGroupLocalThreadID.x,
        WorkGroupLocalThreadID.y + WorkGroupID.y * iTileCount * FFX_BLUR_TILE_SIZE_Y
    );
    FfxInt32x2 lxy = WorkGroupLocalThreadID;

    // Threads whose column lies outside the image exit here, before any of the barriers below.
    // NOTE(review): confirm the target compilers/APIs accept LDSBarrier() after this divergent return.
    if (gxy.x >= ImageSize.x)
        return;

    //-------------------------------------------------------------------------------------------------
    // STEP #1
    //-------------------------------------------------------------------------------------------------
    // Pre-fill the output cache with a few tiles of horizontally blurred image.
    // The tile count to pre-fill is a function of kernel width and TileSizeY.
    PreFillOutputCache(gxy, lxy, WorkGroupID, ImageSize); // ends with an LDSBarrier()

    //-------------------------------------------------------------------------------------------------
    // STEP #2
    //-------------------------------------------------------------------------------------------------
    // Loop through the tiles and write out to the UAV as we go from top to bottom.
    FfxInt32 iTileOutput = 0;
#if FFX_HALF
    FfxFloat16x3 c = FfxFloat16x3(0, 0, 0);
#else
    FfxFloat32x3 c = FfxFloat32x3(0, 0, 0);
#endif
    for (; iTileOutput < iStreamedTileCount; ++iTileOutput)
    {
        // Index of the next tile that we'll cache the output to.
        // It runs ahead of the tile we will be writing out to UAV by NUM_PREFILL_TILES_OUTPUT_CACHE tiles.
        FfxInt32 iNextTileOutputCache = iTileOutput + NUM_PREFILL_TILES_OUTPUT_CACHE;
        const FfxInt32x2 HorizontalBlurInputCoord = gxy + FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iNextTileOutputCache);

        // Run horizontal blur & cache the next output tile.
#if BLUR_ENABLE_INPUT_CACHE
        FillInputCache(lxy, HorizontalBlurInputCoord, ImageSize);
        c = HorizontalBlurFromCachedInput(lxy);
#else
        // Number of image_load instructions will scale with FFX_BLUR_OPTION_KERNEL_DIMENSION.
        c = HorizontalBlurFromTexture(HorizontalBlurInputCoord, ImageSize);
#endif // BLUR_ENABLE_INPUT_CACHE

#if FFX_HALF
        CacheInOutputTile(lxy, FfxInt16(iNextTileOutputCache), c);
#else
        CacheInOutputTile(lxy, iNextTileOutputCache, c);
#endif
        LDSBarrier(); // OutputCache Sync: Write -> Read =========================================

        // Start writing out the pixel value which has its final value
        // convolved from the pixels already in the LDS section.
        const FfxInt32x2 OutputCoord = gxy + FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iTileOutput);
#if FFX_HALF
        c = VerticalBlurFromCachedOutput(lxy, WorkGroupID, FfxInt16x2(OutputCoord), FfxInt16x2(ImageSize));
#else
        c = VerticalBlurFromCachedOutput(lxy, WorkGroupID, OutputCoord, ImageSize);
#endif
        BlurStoreOutput(OutputCoord, c);
        LDSBarrier(); // OutputCache Sync: Read -> Write =========================================
    }

    //-------------------------------------------------------------------------------------------------
    // STEP #3
    //-------------------------------------------------------------------------------------------------
    // Fill in the remaining last tiles (at most NUM_PREFILL_TILES_OUTPUT_CACHE iterations).
    // STEP #2 only exits once iTileOutput >= iTileCount - NUM_PREFILL_TILES_OUTPUT_CACHE, so every
    // iteration here still caches the tile that runs NUM_PREFILL_TILES_OUTPUT_CACHE ahead; no extra
    // range check is needed and the barrier below is reached by every iteration.
    for (; iTileOutput < iTileCount; ++iTileOutput)
    {
        const FfxInt32x2 OutputCoord = gxy + FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iTileOutput);

        FfxInt32 iNextTileOutputCache = iTileOutput + NUM_PREFILL_TILES_OUTPUT_CACHE;
        const FfxInt32x2 HorizontalBlurInputCoord = gxy + FfxInt32x2(0, FFX_BLUR_TILE_SIZE_Y * iNextTileOutputCache);

        // Run horizontal blur & cache the next output tile.
#if BLUR_ENABLE_INPUT_CACHE
        FillInputCache(lxy, HorizontalBlurInputCoord, ImageSize);
        c = HorizontalBlurFromCachedInput(lxy);
#else
        // Number of image_load instructions will scale with FFX_BLUR_OPTION_KERNEL_DIMENSION.
        c = HorizontalBlurFromTexture(HorizontalBlurInputCoord, ImageSize);
#endif // BLUR_ENABLE_INPUT_CACHE

#if FFX_HALF
        CacheInOutputTile(lxy, FfxInt16(iNextTileOutputCache), c);
#else
        CacheInOutputTile(lxy, iNextTileOutputCache, c);
#endif
        LDSBarrier(); // OutputCache Sync: Write -> Read =========================================

#if FFX_HALF
        c = VerticalBlurFromCachedOutput(lxy, WorkGroupID, FfxInt16x2(OutputCoord), FfxInt16x2(ImageSize));
#else
        c = VerticalBlurFromCachedOutput(lxy, WorkGroupID, OutputCoord, ImageSize);
#endif
        BlurStoreOutput(OutputCoord, c);

        // NOTE(review): unlike STEP #2 there is no Read -> Write barrier at the end of this loop body.
        // Confirm that the output-cache tile written by the next iteration can never be one that
        // another wave of the same workgroup is still reading for this iteration.
    }
}

#endif // !FFX_CPU

#endif // FFX_BLUR_H