// This file is part of the FidelityFX SDK. // // Copyright (C) 2024 Advanced Micro Devices, Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and /or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions : // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "ffx_core.h"
#include "ffx_dof_common.h"

#if FFX_HALF
// Median of nine half-precision values.
// The inputs are handled as three triples (a,b,c), (d,e,f) and (g,h,i): the minimum,
// median and maximum of each triple are computed, then the largest minimum, the median
// of the medians and the smallest maximum are reduced with one final med3.
FfxFloat16 FfxMed9(FfxFloat16 a, FfxFloat16 b, FfxFloat16 c,
                   FfxFloat16 d, FfxFloat16 e, FfxFloat16 f,
                   FfxFloat16 g, FfxFloat16 h, FfxFloat16 i)
{
    // TODO (perf): compiler does not use med3 at all for these
    FfxFloat16 minABC = ffxMin3Half(a, b, c);
    FfxFloat16 minDEF = ffxMin3Half(d, e, f);
    FfxFloat16 minGHI = ffxMin3Half(g, h, i);
    FfxFloat16 largestMin = ffxMax3Half(minABC, minDEF, minGHI);

    FfxFloat16 medABC = ffxMed3Half(a, b, c);
    FfxFloat16 medDEF = ffxMed3Half(d, e, f);
    FfxFloat16 medGHI = ffxMed3Half(g, h, i);
    FfxFloat16 middleMed = ffxMed3Half(medABC, medDEF, medGHI);

    FfxFloat16 maxABC = ffxMax3Half(a, b, c);
    FfxFloat16 maxDEF = ffxMax3Half(d, e, f);
    FfxFloat16 maxGHI = ffxMax3Half(g, h, i);
    FfxFloat16 smallestMax = ffxMin3Half(maxABC, maxDEF, maxGHI);

    return ffxMed3Half(largestMin, middleMed, smallestMax);
}
#endif

// Median of nine single-precision values.
// Same reduction as the half-precision overload: per-triple min/med/max, followed by
// max-of-mins, med-of-meds and min-of-maxes, combined with a final med3.
FfxFloat32 FfxMed9(FfxFloat32 a, FfxFloat32 b, FfxFloat32 c,
                   FfxFloat32 d, FfxFloat32 e, FfxFloat32 f,
                   FfxFloat32 g, FfxFloat32 h, FfxFloat32 i)
{
    // TODO (perf): compiler does not use med3 at all for these
    FfxFloat32 minABC = ffxMin3(a, b, c);
    FfxFloat32 minDEF = ffxMin3(d, e, f);
    FfxFloat32 minGHI = ffxMin3(g, h, i);
    FfxFloat32 largestMin = ffxMax3(minABC, minDEF, minGHI);

    FfxFloat32 medABC = ffxMed3(a, b, c);
    FfxFloat32 medDEF = ffxMed3(d, e, f);
    FfxFloat32 medGHI = ffxMed3(g, h, i);
    FfxFloat32 middleMed = ffxMed3(medABC, medDEF, medGHI);

    FfxFloat32 maxABC = ffxMax3(a, b, c);
    FfxFloat32 maxDEF = ffxMax3(d, e, f);
    FfxFloat32 maxGHI = ffxMax3(g, h, i);
    FfxFloat32 smallestMax = ffxMin3(maxABC, maxDEF, maxGHI);

    return ffxMed3(largestMin, middleMed, smallestMax);
}

// Evaluates the cubic -2x^3 + 3x^2 in its factored form x^2 * (3 - 2x).
FfxFloat32 FfxCubicSpline(FfxFloat32 x)
{
    FfxFloat32 xSquared = x * x;
    return xSquared * (3 - 2 * x);
}

// Edge length of the half-resolution tile processed by one thread group.
FFX_STATIC const FfxUInt32 FFX_DOF_COMBINE_TILE_SIZE = 8;
// Row pitch of the intermediate tile storage.
// Add +2 for 3x3 filter margin, +1 on one side for bilinear filter.
FFX_STATIC const FfxUInt32 FFX_DOF_COMBINE_ROW_PITCH = FFX_DOF_COMBINE_TILE_SIZE + 3;
// Number of entries in one intermediate tile (ROW_PITCH x ROW_PITCH).
FFX_STATIC const FfxUInt32 FFX_DOF_COMBINE_AREA = FFX_DOF_COMBINE_ROW_PITCH * FFX_DOF_COMBINE_ROW_PITCH;

#if FFX_HALF
// Group-shared intermediates for the 16-bit path. Two halves share one 32-bit word.
// Luma word: .x holds the near-field luma, .y the far-field luma (see the setters below).
FFX_GROUPSHARED FfxUInt32 FfxDofLDSLuma[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxUInt32 FfxDofLDSNearRG[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxUInt32 FfxDofLDSNearBA[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxUInt32 FfxDofLDSFarRG[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxUInt32 FfxDofLDSFarBA[FFX_DOF_COMBINE_AREA];
// Full-resolution color tile, 18x18 entries (filled by FfxDofFetchFullColor).
FFX_GROUPSHARED FfxUInt32 FfxDofLDSFullColorRG[18*18];
FFX_GROUPSHARED FfxFloat16 FfxDofLDSFullColorB[18*18];
#else // #if FFX_HALF
// Group-shared intermediates for the 32-bit path, one array per channel.
FFX_GROUPSHARED FfxFloat32 FfxDofLDSNearLuma[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32 FfxDofLDSFarLuma[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32 FfxDofLDSNearR[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32 FfxDofLDSNearG[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32 FfxDofLDSNearB[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32 FfxDofLDSNearA[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32 FfxDofLDSFarR[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32 FfxDofLDSFarG[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32 FfxDofLDSFarB[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32 FfxDofLDSFarA[FFX_DOF_COMBINE_AREA];
// Full-resolution color tile, 18x18 entries (filled by FfxDofFetchFullColor).
FFX_GROUPSHARED FfxFloat32 FfxDofLDSFullColorR[18*18];
FFX_GROUPSHARED FfxFloat32 FfxDofLDSFullColorG[18*18];
FFX_GROUPSHARED FfxFloat32 FfxDofLDSFullColorB[18*18];
#endif // #if FFX_HALF #else

#if FFX_HALF
// Reads the near-field RGBA value stored at intermediate tile entry idx.
FfxFloat16x4 FfxDofGetIntermediateNearColor(FfxUInt32 idx)
{
    FfxFloat16x2 rg = FFX_UINT32_TO_FLOAT16X2(FfxDofLDSNearRG[idx]);
    FfxFloat16x2 ba = FFX_UINT32_TO_FLOAT16X2(FfxDofLDSNearBA[idx]);
    return FfxFloat16x4(rg, ba);
}

// Reads the far-field RGBA value stored at intermediate tile entry idx.
FfxFloat16x4 FfxDofGetIntermediateFarColor(FfxUInt32 idx)
{
    FfxFloat16x2 rg = FFX_UINT32_TO_FLOAT16X2(FfxDofLDSFarRG[idx]);
    FfxFloat16x2 ba = FFX_UINT32_TO_FLOAT16X2(FfxDofLDSFarBA[idx]);
    return FfxFloat16x4(rg, ba);
}

// Reads the full-resolution RGB value stored at entry idx of the 18x18 color tile.
FfxFloat16x3 FfxDofGetIntFullColor(FfxUInt32 idx)
{
    FfxFloat16x2 rg = FFX_UINT32_TO_FLOAT16X2(FfxDofLDSFullColorRG[idx]);
    FfxFloat16 b = FfxDofLDSFullColorB[idx];
    return FfxFloat16x3(rg, b);
}

// Stores the near-field luma in the .x half of the shared luma word at idx.
// This is a read-modify-write so that the .y half (far luma) is carried over.
void FfxDofSetIntNearLuma(FfxUInt32 idx, FfxFloat16 luma)
{
    FfxFloat16x2 unpacked = FFX_UINT32_TO_FLOAT16X2(FfxDofLDSLuma[idx]);
    unpacked.x = luma;
    FfxDofLDSLuma[idx] = FFX_FLOAT16X2_TO_UINT32(unpacked);
}

// Stores the far-field luma in the .y half of the shared luma word at idx.
// This is a read-modify-write so that the .x half (near luma) is carried over.
void FfxDofSetIntFarLuma(FfxUInt32 idx, FfxFloat16 luma)
{
    FfxFloat16x2 unpacked = FFX_UINT32_TO_FLOAT16X2(FfxDofLDSLuma[idx]);
    unpacked.y = luma;
    FfxDofLDSLuma[idx] = FFX_FLOAT16X2_TO_UINT32(unpacked);
}

// Writes a near-field RGBA value to intermediate tile entry idx.
void FfxDofSetIntermediateNearColor(FfxUInt32 idx, FfxFloat16x4 col)
{
    FfxDofLDSNearRG[idx] = FFX_FLOAT16X2_TO_UINT32(col.rg);
    FfxDofLDSNearBA[idx] = FFX_FLOAT16X2_TO_UINT32(col.ba);
}

// Writes a far-field RGBA value to intermediate tile entry idx.
void FfxDofSetIntermediateFarColor(FfxUInt32 idx, FfxFloat16x4 col)
{
    FfxDofLDSFarRG[idx] = FFX_FLOAT16X2_TO_UINT32(col.rg);
    FfxDofLDSFarBA[idx] = FFX_FLOAT16X2_TO_UINT32(col.ba);
}

// Writes a full-resolution RGB value to entry idx of the 18x18 color tile.
void FfxDofSetIntFullColor(FfxUInt32 idx, FfxFloat16x3 col)
{
    FfxDofLDSFullColorRG[idx] = FFX_FLOAT16X2_TO_UINT32(col.rg);
    FfxDofLDSFullColorB[idx] = col.b;
}

// Returns the alpha channel of the near-field entry at idx, displaced by
// (offX, offY) entries within the intermediate tile.
FfxFloat16 FfxDofGetIntermediateNearAlpha(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
    return FfxDofGetIntermediateNearColor(idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY).a;
}

// Returns the alpha channel of the far-field entry at idx.
FfxFloat16 FfxDofGetIntermediateFarAlpha(FfxUInt32 idx)
{
    return FfxDofGetIntermediateFarColor(idx).a;
}
#else // #if FFX_HALF
// Reads the near-field RGBA value stored at intermediate tile entry idx.
FfxFloat32x4 FfxDofGetIntermediateNearColor(FfxUInt32 idx)
{
    return FfxFloat32x4(FfxDofLDSNearR[idx], FfxDofLDSNearG[idx], FfxDofLDSNearB[idx], FfxDofLDSNearA[idx]);
}

// Reads the far-field RGBA value stored at intermediate tile entry idx.
FfxFloat32x4 FfxDofGetIntermediateFarColor(FfxUInt32 idx)
{
    return FfxFloat32x4(FfxDofLDSFarR[idx], FfxDofLDSFarG[idx], FfxDofLDSFarB[idx], FfxDofLDSFarA[idx]);
}

// Reads the full-resolution RGB value stored at entry idx of the 18x18 color tile.
FfxFloat32x3 FfxDofGetIntFullColor(FfxUInt32 idx)
{
    return FfxFloat32x3(FfxDofLDSFullColorR[idx], FfxDofLDSFullColorG[idx], FfxDofLDSFullColorB[idx]);
}

// Stores the near-field luma for intermediate tile entry idx.
void FfxDofSetIntNearLuma(FfxUInt32 idx, FfxFloat32 luma)
{
    FfxDofLDSNearLuma[idx] = luma;
}

// Stores the far-field luma for intermediate tile entry idx.
void FfxDofSetIntFarLuma(FfxUInt32 idx, FfxFloat32 luma)
{
    FfxDofLDSFarLuma[idx] = luma;
}

// Writes a near-field RGBA value to intermediate tile entry idx.
void FfxDofSetIntermediateNearColor(FfxUInt32 idx, FfxFloat32x4 col)
{
    FfxDofLDSNearR[idx] = col.r;
    FfxDofLDSNearG[idx] = col.g;
    FfxDofLDSNearB[idx] = col.b;
    FfxDofLDSNearA[idx] = col.a;
}

// Writes a far-field RGBA value to intermediate tile entry idx.
void FfxDofSetIntermediateFarColor(FfxUInt32 idx, FfxFloat32x4 col)
{
    FfxDofLDSFarR[idx] = col.r;
    FfxDofLDSFarG[idx] = col.g;
    FfxDofLDSFarB[idx] = col.b;
    FfxDofLDSFarA[idx] = col.a;
}

// Writes a full-resolution RGB value to entry idx of the 18x18 color tile.
void FfxDofSetIntFullColor(FfxUInt32 idx, FfxFloat32x3 col)
{
    FfxDofLDSFullColorR[idx] = col.r;
    FfxDofLDSFullColorG[idx] = col.g;
    FfxDofLDSFullColorB[idx] = col.b;
}

// Returns the alpha channel of the near-field entry at idx, displaced by
// (offX, offY) entries within the intermediate tile.
FfxFloat32 FfxDofGetIntermediateNearAlpha(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
    return FfxDofGetIntermediateNearColor(idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY).a;
}

// Returns the alpha channel of the far-field entry at idx.
FfxFloat32 FfxDofGetIntermediateFarAlpha(FfxUInt32 idx)
{
    return FfxDofGetIntermediateFarColor(idx).a;
}
#endif // #if FFX_HALF #else

// Weighted 3x3 blur over the full-resolution color tile.
// baseIdx addresses the top-left entry of the 3x3 window in the 18-entry-pitch tile,
// so the window center is at baseIdx + 19 (one row down, one column right).
// Returns the weighted sum divided by the sum of the weights.
FfxHalfOpt3 FfxDofBlur3x3(FfxUInt32 baseIdx)
{
    // kernel coefficients based on coverage of a circle in a 3x3 grid
    const FfxHalfOpt CORNER = FfxHalfOpt(0.5453);
    const FfxHalfOpt SIDE = FfxHalfOpt(0.9717);
    // accumulate convolution
    // (center has weight 1; four corners and four sides share CORNER / SIDE respectively)
    const FfxHalfOpt weights_sum = FfxHalfOpt(1) + FfxHalfOpt(4) * CORNER + FfxHalfOpt(4) * SIDE;
    FfxHalfOpt3 sum = FfxDofGetIntFullColor(baseIdx + 19)
        // corners: offsets 0, 2, 36 (= 2 * 18) and 38
        + CORNER * (FfxDofGetIntFullColor(baseIdx) + FfxDofGetIntFullColor(baseIdx + 2) + FfxDofGetIntFullColor(baseIdx + 36) + FfxDofGetIntFullColor(baseIdx + 38))
        // sides: offsets 1 (top), 18 (left), 20 (right) and 37 (bottom)
        + SIDE * (FfxDofGetIntFullColor(baseIdx + 1) + FfxDofGetIntFullColor(baseIdx + 18) + FfxDofGetIntFullColor(baseIdx + 20) + FfxDofGetIntFullColor(baseIdx + 37));
    return sum / weights_sum;
}

// Produces the final color for one output pixel by blending, in order:
// the sharp full-res color, a fixed 3x3 blur of it, the background (bg) color and
// the foreground (fg) color.
//
// coord    - pixel coordinate passed to FfxDofLoadFullDepth
// relCoord - pixel coordinate relative to the 18x18 full color tile window origin
// bg       - background color; rgb is divided by alpha when alpha is positive
// fg       - foreground color; its alpha drives the final blend through FfxCubicSpline
// minFgW   - minimum foreground weight of the interpolated samples; a value of zero
//            disables the foreground contribution
// Returns the combined color with alpha set to 1.
FfxHalfOpt4 FfxDofFinalCombineColors(FfxUInt32x2 coord, FfxUInt32x2 relCoord, FfxHalfOpt4 bg, FfxHalfOpt4 fg, FfxHalfOpt minFgW)
{
    FfxFloat32 d = FfxDofLoadFullDepth(coord);
    // Top-left of the 3x3 window in the full color tile; the pixel itself is at +19.
    FfxUInt32 baseIdx = relCoord.x + 18 * relCoord.y;
    FfxHalfOpt3 full = FfxHalfOpt3(FfxDofGetIntFullColor(baseIdx + 19));
    FfxHalfOpt3 fixBlurred = FfxHalfOpt3(FfxDofBlur3x3(baseIdx));

    // expand background around edges
    if (bg.a > FfxHalfOpt(0)) bg.rgb /= bg.a;
    // if any FG sample has zero weight, the interpolation is invalid.
    if (minFgW == FfxHalfOpt(0)) fg.a = FfxHalfOpt(0);

    FfxHalfOpt c = FfxHalfOpt(2) * FfxHalfOpt(abs(FfxDofGetCoc(d))); // double it for full-res pixels
    FfxHalfOpt c1 = ffxSaturate(c - FfxHalfOpt(0.5)); // lerp factor for full vs. fixed 1.5px blur
    FfxHalfOpt c2 = ffxSaturate(c - FfxHalfOpt(1.5)); // lerp factor for prev vs. quarter res
    // Without any background weight there is nothing valid to blend towards.
    if (bg.a == FfxHalfOpt(0)) c2 = FfxHalfOpt(0);
    FfxHalfOpt3 combinedColor = ffxLerp(full, fixBlurred, c1);
    combinedColor = ffxLerp(combinedColor, bg.rgb, c2);
    combinedColor = ffxLerp(combinedColor, fg.rgb, FfxHalfOpt(FfxCubicSpline(fg.a)));
    return FfxHalfOpt4(combinedColor, 1);
}

#if FFX_HALF
// Returns the near-field luma at entry idx displaced by (offX, offY) (the .x half of the luma word).
FfxFloat16 FfxDofGetLDSNearLuma(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
    return FFX_UINT32_TO_FLOAT16X2(FfxDofLDSLuma[idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY]).x;
}

// Returns the far-field luma at entry idx displaced by (offX, offY) (the .y half of the luma word).
FfxFloat16 FfxDofGetLDSFarLuma(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
    return FFX_UINT32_TO_FLOAT16X2(FfxDofLDSLuma[idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY]).y;
}
#else // #if FFX_HALF
// Returns the near-field luma at entry idx displaced by (offX, offY).
FfxFloat32 FfxDofGetLDSNearLuma(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
    return FfxDofLDSNearLuma[idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY];
}

// Returns the far-field luma at entry idx displaced by (offX, offY).
FfxFloat32 FfxDofGetLDSFarLuma(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
    return FfxDofLDSFarLuma[idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY];
}
#endif // #if FFX_HALF #else

// Far-field post-filter for one intermediate entry.
// baseIdx addresses the top-left entry of a 3x3 window; the filtered pixel is the
// window center. The center color is rescaled by (median luma / center luma),
// limited to [0, 2]. Returns the rescaled rgb with the center's alpha.
FfxHalfOpt4 FfxDofFilterFF(FfxUInt32 baseIdx)
{
    // get the median of the surrounding 3x3 area of luma values
    FfxHalfOpt med_luma = FfxMed9(
        FfxDofGetLDSFarLuma(baseIdx, 0, 0), FfxDofGetLDSFarLuma(baseIdx, 1, 0), FfxDofGetLDSFarLuma(baseIdx, 2, 0),
        FfxDofGetLDSFarLuma(baseIdx, 0, 1), FfxDofGetLDSFarLuma(baseIdx, 1, 1), FfxDofGetLDSFarLuma(baseIdx, 2, 1),
        FfxDofGetLDSFarLuma(baseIdx, 0, 2), FfxDofGetLDSFarLuma(baseIdx, 1, 2), FfxDofGetLDSFarLuma(baseIdx, 2, 2));
    // index of the window center
    FfxUInt32 idx = baseIdx + FFX_DOF_COMBINE_ROW_PITCH + 1;
    FfxHalfOpt3 col = FfxDofGetIntermediateFarColor(idx).rgb;
    FfxHalfOpt lumaFactor = clamp(med_luma / FfxDofGetLDSFarLuma(idx, 0, 0), FfxHalfOpt(0), FfxHalfOpt(2));
    // corner fix: if color pixel is on a corner (has 5 black pixels as neighbor), don't reduce color.
    if (med_luma == FfxHalfOpt(0)) lumaFactor = FfxHalfOpt(1);
    return FfxHalfOpt4(col * lumaFactor, FfxDofGetIntermediateFarAlpha(idx));
}

// Near-field post-filter for one intermediate entry.
// baseIdx addresses the top-left entry of a 3x3 window. The returned alpha is the
// average alpha of the window. The returned rgb is taken from the window center, or,
// when the center alpha is below 0.01, from the window corner with the greatest luma;
// it is rescaled by (median luma / luma of the chosen entry), limited to [0, 2].
FfxHalfOpt4 FfxDofFilterNF(FfxUInt32 baseIdx)
{
    // Get 3x3 median luma
    FfxHalfOpt med_luma = FfxMed9(
        FfxDofGetLDSNearLuma(baseIdx, 0, 0), FfxDofGetLDSNearLuma(baseIdx, 1, 0), FfxDofGetLDSNearLuma(baseIdx, 2, 0),
        FfxDofGetLDSNearLuma(baseIdx, 0, 1), FfxDofGetLDSNearLuma(baseIdx, 1, 1), FfxDofGetLDSNearLuma(baseIdx, 2, 1),
        FfxDofGetLDSNearLuma(baseIdx, 0, 2), FfxDofGetLDSNearLuma(baseIdx, 1, 2), FfxDofGetLDSNearLuma(baseIdx, 2, 2));
    // Mean alpha over the 3x3 window
    FfxHalfOpt avg_alpha = FfxHalfOpt(ffxReciprocal(9.0)) * (
        FfxDofGetIntermediateNearAlpha(baseIdx, 0, 0) + FfxDofGetIntermediateNearAlpha(baseIdx, 1, 0) + FfxDofGetIntermediateNearAlpha(baseIdx, 2, 0) +
        FfxDofGetIntermediateNearAlpha(baseIdx, 0, 1) + FfxDofGetIntermediateNearAlpha(baseIdx, 1, 1) + FfxDofGetIntermediateNearAlpha(baseIdx, 2, 1) +
        FfxDofGetIntermediateNearAlpha(baseIdx, 0, 2) + FfxDofGetIntermediateNearAlpha(baseIdx, 1, 2) + FfxDofGetIntermediateNearAlpha(baseIdx, 2, 2));
    // index of the window center
    FfxUInt32 idx = baseIdx + FFX_DOF_COMBINE_ROW_PITCH + 1;
    if (FfxDofGetIntermediateNearAlpha(idx, 0, 0) < 0.01)
    {
        // center has zero weight, grab one of the corner colors
        // (start from the top-left corner and keep whichever corner has the larger luma)
        FfxUInt32 maxIdx = baseIdx;
        if (FfxDofGetLDSNearLuma(baseIdx, 2, 0) > FfxDofGetLDSNearLuma(maxIdx, 0, 0)) maxIdx = baseIdx + 2;
        if (FfxDofGetLDSNearLuma(baseIdx, 0, 2) > FfxDofGetLDSNearLuma(maxIdx, 0, 0)) maxIdx = baseIdx + 2 * FFX_DOF_COMBINE_ROW_PITCH;
        if (FfxDofGetLDSNearLuma(baseIdx, 2, 2) > FfxDofGetLDSNearLuma(maxIdx, 0, 0)) maxIdx = baseIdx + 2 * FFX_DOF_COMBINE_ROW_PITCH + 2;
        idx = maxIdx;
    }
    FfxHalfOpt3 col = FfxDofGetIntermediateNearColor(idx).rgb;
    // With a zero median the color is passed through unscaled.
    FfxHalfOpt lumaFactor = med_luma > FfxHalfOpt(0) ? clamp(med_luma / FfxDofGetLDSNearLuma(idx, 0, 0), FfxHalfOpt(0), FfxHalfOpt(2)) : FfxHalfOpt(1.0);
    return FfxHalfOpt4(col.rgb * lumaFactor, avg_alpha);
}

// Loads the dilated radius for the 2x2 tiles starting at (group * 2) and reduces them.
// Returns (maximum of the .x components, minimum of the .y components).
FfxFloat32x2 FfxDofGetTileRadius(FfxUInt32x2 group)
{
    // need to read 4 values
    FfxUInt32x2 tile = group * 2;
    FfxFloat32x2 a = FfxDofLoadDilatedRadius(tile);
    FfxFloat32x2 b = FfxDofLoadDilatedRadius(tile + FfxUInt32x2(0, 1));
    FfxFloat32x2 c = FfxDofLoadDilatedRadius(tile + FfxUInt32x2(1, 0));
    FfxFloat32x2 d = FfxDofLoadDilatedRadius(tile + FfxUInt32x2(1, 1));
    FfxFloat32 near = max(a.x, ffxMax3(b.x, c.x, d.x));
    FfxFloat32 far = min(a.y, ffxMin3(b.y, c.y, d.y));
    return FfxFloat32x2(near, far);
}

// Pass-through for tiles that need no blur: each thread copies four pixels of the
// 16x16 tile (at offsets (0,0), (8,0), (0,8) and (8,8) from its own position) from the
// full-res input to the output. Compiles to nothing when FFX_DOF_OPTION_COMBINE_IN_PLACE
// is defined and non-zero.
void FfxDofCombineSharpOnly(FfxUInt32x2 group, FfxUInt32x2 thread)
{
#if !defined(FFX_DOF_OPTION_COMBINE_IN_PLACE) || !FFX_DOF_OPTION_COMBINE_IN_PLACE
    FfxUInt32x2 base = 16 * group;
    FfxDofStoreOutput(base + thread + FfxUInt32x2(0, 0), FfxDofLoadFullInput(base + thread + FfxUInt32x2(0, 0)));
    FfxDofStoreOutput(base + thread + FfxUInt32x2(8, 0), FfxDofLoadFullInput(base + thread + FfxUInt32x2(8, 0)));
    FfxDofStoreOutput(base + thread + FfxUInt32x2(0, 8), FfxDofLoadFullInput(base + thread + FfxUInt32x2(0, 8)));
    FfxDofStoreOutput(base + thread + FfxUInt32x2(8, 8), FfxDofLoadFullInput(base + thread + FfxUInt32x2(8, 8)));
#endif
}

// Fills the 18x18 full-resolution color tile: the 16x16 pixels of tile gid plus a
// one-pixel border, with coordinates clamped to the image.
//
// gid       - group (tile) coordinate
// gix       - flat index of the thread within the group
// imageSize - size used for clamping the fetch coordinates
void FfxDofFetchFullColor(FfxUInt32x2 gid, FfxUInt32 gix, FfxUInt32x2 imageSize)
{
    // Six rounds in steps of 64 entries; indices past the end wrap around via the modulo,
    // so some entries are fetched twice.
    FFX_DOF_UNROLL
    for (FfxUInt32 iter = 0; iter < 6; iter++)
    {
        FfxUInt32 iFetch = (gix + iter * 64) % (18 * 18);
        // The -1 offsets shift the window so that entry (1,1) is the tile's first pixel.
        FfxInt32x2 coord = FfxInt32x2(gid * 16) + FfxInt32x2(iFetch % 18 - 1, iFetch / 18 - 1);
        coord = clamp(coord, FfxInt32x2(0, 0), FfxInt32x2(imageSize) - FfxInt32x2(1, 1));
        FfxHalfOpt3 color = FfxHalfOpt3(FfxDofLoadFullInput(coord).rgb);
        FfxDofSetIntFullColor(iFetch, color);
    }
}

void FfxDofSwizQuad(inout FfxUInt32x2 a, inout FfxUInt32x2 b, inout FfxUInt32x2 c, inout FfxUInt32x2 d)
{
    // Input: four color values in a quad.
    // Re-orders the output to a swizzled format for better store throughput.
    // This maps from one quad per lane, stored in four separate registers to
    // four 16x2 regions (one per register).
    // This is done in two steps. First, permute the values among the lanes
    // using WaveReadLaneAt. Second, swap values between registers.
    // This only works for lane counts >= 32, do nothing otherwise for compatibility
#if FFX_HLSL
    if (WaveGetLaneCount() < 32) return;
    FfxUInt32 lane = WaveGetLaneIndex();
    // index for A, switch bits around 43210 -> 10432.
    FfxUInt32 idxA = ((lane & 3) << 3) + (lane >> 2);
    // Adding 8/16/24 for B/C/D makes each variable offset from the previous by one slot.
    // (lane & ~31) keeps the source lane within the same block of 32 lanes.
    a = WaveReadLaneAt(a, (lane & ~31) + (idxA + 0) % 32);
    b = WaveReadLaneAt(b, (lane & ~31) + (idxA + 8) % 32);
    c = WaveReadLaneAt(c, (lane & ~31) + (idxA + 16) % 32);
    d = WaveReadLaneAt(d, (lane & ~31) + (idxA + 24) % 32);
#elif FFX_GLSL
    // Same permutation as the HLSL branch, expressed with subgroup operations.
    if (gl_SubgroupSize < 32) return;
    FfxUInt32 lane = gl_SubgroupInvocationID;
    FfxUInt32 idxA = ((lane & 3) << 3) + (lane >> 2);
    a = subgroupShuffle(a, (lane & ~31) + (idxA + 0) % 32);
    b = subgroupShuffle(b, (lane & ~31) + (idxA + 8) % 32);
    c = subgroupShuffle(c, (lane & ~31) + (idxA + 16) % 32);
    d = subgroupShuffle(d, (lane & ~31) + (idxA + 24) % 32);
#endif
    // NOTE(review): `lane` is only declared inside the two branches above, so the code
    // below requires one of FFX_HLSL / FFX_GLSL to be enabled.

    // Now, for each lane, a/b/c/d contain one value from each of the four 16x2 lines.
    // And each group of 4 lanes have values from the same quads.
    // We just need to shuffle between abcd, so that each set of 4 lanes contains one quad per variable.
    // General idea: rotate by (lane % 4) variables.
    if ((lane & 1) != 0)
    {
        // rotate A->B->C->D->A
        FfxUInt32x2 tmp = d;
        d = c;
        c = b;
        b = a;
        a = tmp;
    }
    if ((lane & 2) != 0)
    {
        // swap A<->C and B<->D
        FfxUInt32x2 tmp = a;
        a = c;
        c = tmp;
        tmp = b;
        b = d;
        d = tmp;
    }
}

#if FFX_HALF
void FfxDofSwizQuad(inout FfxFloat16x4 a, inout FfxFloat16x4 b, inout FfxFloat16x4 c, inout FfxFloat16x4 d)
{
    // Same as above.
    // Each half4 is converted to a uint2 so the integer overload can do the swizzle,
    // then converted back.
    FfxUInt32x2 packed_a = FFX_FLOAT16X4_TO_UINT32X2(a);
    FfxUInt32x2 packed_b = FFX_FLOAT16X4_TO_UINT32X2(b);
    FfxUInt32x2 packed_c = FFX_FLOAT16X4_TO_UINT32X2(c);
    FfxUInt32x2 packed_d = FFX_FLOAT16X4_TO_UINT32X2(d);
    FfxDofSwizQuad(packed_a, packed_b, packed_c, packed_d);
    a = FFX_UINT32X2_TO_FLOAT16X4(packed_a);
    b = FFX_UINT32X2_TO_FLOAT16X4(packed_b);
    c = FFX_UINT32X2_TO_FLOAT16X4(packed_c);
    d = FFX_UINT32X2_TO_FLOAT16X4(packed_d);
}
#else // #if FFX_HALF
void FfxDofSwizQuad(inout FfxFloat32x4 a, inout FfxFloat32x4 b, inout FfxFloat32x4 c, inout FfxFloat32x4 d)
{
    // Same as above.
    // A float4 does not fit in one uint2, so the xy and zw halves are swizzled by two
    // separate calls to the integer overload and reassembled afterwards.
    FfxUInt32x2 a0 = ffxAsUInt32(a.xy);
    FfxUInt32x2 a1 = ffxAsUInt32(a.zw);
    FfxUInt32x2 b0 = ffxAsUInt32(b.xy);
    FfxUInt32x2 b1 = ffxAsUInt32(b.zw);
    FfxUInt32x2 c0 = ffxAsUInt32(c.xy);
    FfxUInt32x2 c1 = ffxAsUInt32(c.zw);
    FfxUInt32x2 d0 = ffxAsUInt32(d.xy);
    FfxUInt32x2 d1 = ffxAsUInt32(d.zw);
    FfxDofSwizQuad(a0, b0, c0, d0);
    FfxDofSwizQuad(a1, b1, c1, d1);
    a = FfxFloat32x4(ffxAsFloat(a0), ffxAsFloat(a1));
    b = FfxFloat32x4(ffxAsFloat(b0), ffxAsFloat(b1));
    c = FfxFloat32x4(ffxAsFloat(c0), ffxAsFloat(c1));
    d = FfxFloat32x4(ffxAsFloat(d0), ffxAsFloat(d1));
}
#endif // #if FFX_HALF #else

// Combines a tile that only needs the far field.
// Steps: load far colors into the intermediate tile, post-filter them, upscale each
// half-res pixel to a 2x2 quad, blend with the full-res color and store the result.
// The foreground inputs of FfxDofFinalCombineColors are all zero here.
//
// id        - thread coordinate; the output quad starts at 2 * id
// gtID      - thread coordinate within the group
// gid       - group (tile) coordinate
// gix       - flat index of the thread within the group
// imageSize - size used for clamping the half-res fetch coordinates
void FfxDofCombineFarOnly(FfxUInt32x2 id, FfxUInt32x2 gtID, FfxUInt32x2 gid, FfxUInt32 gix, FfxUInt32x2 imageSize)
{
    // TODO: Is this the best configuration for fetching?
    FFX_DOF_UNROLL_N(2)
    for (FfxUInt32 iter = 0; iter < 2; iter++)
    {
        // HACK: with the modulo, we will re-fetch some pixels, but might be better than waiting twice (latency-wise)
        // which is what the compiler would do if it had to possibly branch
        FfxUInt32 iFetch = (gix + iter * 64) % FFX_DOF_COMBINE_AREA;
        // The -1 offsets place the tile's first pixel at entry (1,1) of the intermediate tile.
        FfxInt32x2 coord = FfxInt32x2(gid * FFX_DOF_COMBINE_TILE_SIZE) + FfxInt32x2(iFetch % FFX_DOF_COMBINE_ROW_PITCH - 1, iFetch / FFX_DOF_COMBINE_ROW_PITCH - 1);
        coord = clamp(coord, FfxInt32x2(0, 0), FfxInt32x2(imageSize - 1));
        FfxHalfOpt4 ffColor = FfxHalfOpt4(FfxDofLoadFar(coord));

        // calculate and store luma for later median calculation
        FfxHalfOpt ffLuma = FfxHalfOpt(0.2126) * ffColor.r + FfxHalfOpt(0.7152) * ffColor.g + FfxHalfOpt(0.0722) * ffColor.b;
        FfxDofSetIntFarLuma(iFetch, ffLuma);
        FfxDofSetIntermediateFarColor(iFetch, ffColor);
    }

    // All fetched values must be visible before the 3x3 filters read their neighbors.
    FFX_GROUP_MEMORY_BARRIER;

    const FfxUInt32 baseIdx = gtID.x + gtID.y * FFX_DOF_COMBINE_ROW_PITCH;
    // one extra round of filtering needs to be done around the edge, this index maps to that.
    // With TILE_SIZE = 8 and ROW_PITCH = 11: gix 0..8 map to column 8, rows 0..8 and
    // gix 9..16 map to row 8, columns 0..7. The negative term relies on unsigned wrap-around.
    // TODO: This is ugly and possibly slow
    const FfxUInt32 baseIdx2 = (FFX_DOF_COMBINE_TILE_SIZE + FFX_DOF_COMBINE_ROW_PITCH * gix + (gix / (FFX_DOF_COMBINE_TILE_SIZE + 1)) * ((gix - FFX_DOF_COMBINE_TILE_SIZE) * (-FFX_DOF_COMBINE_ROW_PITCH + 1) - (FFX_DOF_COMBINE_TILE_SIZE + 1))) % FFX_DOF_COMBINE_AREA;

    FfxHalfOpt4 ffColor = FfxHalfOpt4(0, 0, 0, 0), ffColor2 = FfxHalfOpt4(0, 0, 0, 0);

    // far-field post-filter
    ffColor = FfxDofFilterFF(baseIdx);
    ffColor2 = gix < (2 * FFX_DOF_COMBINE_TILE_SIZE + 1) ? FfxDofFilterFF(baseIdx2) : FfxHalfOpt4(0, 0, 0, 0);

    // Every thread must finish reading unfiltered values before they are overwritten.
    FFX_GROUP_MEMORY_BARRIER;

    // write out colors for interpolation
    // (the result for the window starting at baseIdx is stored at baseIdx itself)
    FfxDofSetIntermediateFarColor(baseIdx, ffColor);
    if (gix < (2 * FFX_DOF_COMBINE_TILE_SIZE + 1))
    {
        FfxDofSetIntermediateFarColor(baseIdx2, ffColor2);
    }

    // Filtered values must be visible before neighbors are read for upscaling.
    FFX_GROUP_MEMORY_BARRIER;

    // upscaling
    // TR / BL average this pixel with its right / bottom neighbor, BR averages all four.
    FfxHalfOpt4 ffTR = FfxHalfOpt4(0, 0, 0, 0), ffBL = FfxHalfOpt4(0, 0, 0, 0), ffBR = FfxHalfOpt4(0, 0, 0, 0);
    ffTR = FfxHalfOpt(0.5) * ffColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateFarColor(baseIdx + 1);
    ffBL = FfxHalfOpt(0.5) * ffColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH);
    ffBR = FfxHalfOpt(0.5) * ffTR + FfxHalfOpt(0.25) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH) + FfxHalfOpt(0.25) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH + 1);

    // top-left pixel
    FfxUInt32x2 coord = 2 * id;
    FfxUInt32x2 relCoord = 2 * gtID;
    FfxUInt32x2 coordA = coord;
    FfxHalfOpt4 colA = FfxDofFinalCombineColors(coord, relCoord, ffColor, FfxHalfOpt4(0, 0, 0, 0), FfxHalfOpt(0));
    // top-right
    coord.x++;
    relCoord.x++;
    FfxUInt32x2 coordB = coord;
    FfxHalfOpt4 colB = FfxDofFinalCombineColors(coord, relCoord, ffTR, FfxHalfOpt4(0, 0, 0, 0), FfxHalfOpt(0));
    // bottom-right
    coord.y++;
    relCoord.y++;
    FfxUInt32x2 coordC = coord;
    FfxHalfOpt4 colC = FfxDofFinalCombineColors(coord, relCoord, ffBR, FfxHalfOpt4(0, 0, 0, 0), FfxHalfOpt(0));
    // bottom-left
    coord.x--;
    relCoord.x--;
    FfxUInt32x2 coordD = coord;
    FfxHalfOpt4 colD = FfxDofFinalCombineColors(coord, relCoord, ffBL, FfxHalfOpt4(0, 0, 0, 0), FfxHalfOpt(0));

    // TODO: Navi3 should make swizzling unnecessary because it supports write-combining clauses
    // Colors and coordinates go through the same swizzle, so each color stays paired with its coordinate.
    FfxDofSwizQuad(colA, colB, colC, colD);
    FfxDofSwizQuad(coordA, coordB, coordC, coordD);

    FfxDofStoreOutput(coordA, colA);
    FfxDofStoreOutput(coordB, colB);
    FfxDofStoreOutput(coordC, colC);
    FfxDofStoreOutput(coordD, colD);
}

// Combines a tile that needs both the near and the far field.
// Same structure as FfxDofCombineFarOnly, with the near-field color loaded, filtered
// and upscaled alongside the far field and passed as foreground to the final blend.
//
// id        - thread coordinate; the output quad starts at 2 * id
// gtID      - thread coordinate within the group
// gid       - group (tile) coordinate
// gix       - flat index of the thread within the group
// imageSize - size used for clamping the half-res fetch coordinates
void FfxDofCombineAll(FfxUInt32x2 id, FfxUInt32x2 gtID, FfxUInt32x2 gid, FfxUInt32 gix, FfxUInt32x2 imageSize)
{
    // TODO: Is this the best configuration for fetching?
    FFX_DOF_UNROLL_N(2)
    for (FfxUInt32 iter = 0; iter < 2; iter++)
    {
        // HACK: with the modulo, we will re-fetch some pixels, but might be better than waiting twice (latency-wise)
        // which is what the compiler would do if it had to possibly branch
        FfxUInt32 iFetch = (gix + iter * 64) % FFX_DOF_COMBINE_AREA;
        // The -1 offsets place the tile's first pixel at entry (1,1) of the intermediate tile.
        FfxInt32x2 coord = FfxInt32x2(gid * FFX_DOF_COMBINE_TILE_SIZE) + FfxInt32x2(iFetch % FFX_DOF_COMBINE_ROW_PITCH - 1, iFetch / FFX_DOF_COMBINE_ROW_PITCH - 1);
        coord = clamp(coord, FfxInt32x2(0, 0), FfxInt32x2(imageSize - 1));
        FfxHalfOpt4 ffColor = FfxHalfOpt4(FfxDofLoadFar(coord));
        FfxHalfOpt4 nfColor = FfxHalfOpt4(FfxDofLoadNear(coord));

        // calculate and store luma for later median calculation
        FfxHalfOpt ffLuma = FfxHalfOpt(0.2126) * ffColor.r + FfxHalfOpt(0.7152) * ffColor.g + FfxHalfOpt(0.0722) * ffColor.b;
        FfxHalfOpt nfLuma = FfxHalfOpt(0.2126) * nfColor.r + FfxHalfOpt(0.7152) * nfColor.g + FfxHalfOpt(0.0722) * nfColor.b;
        FfxDofSetIntFarLuma(iFetch, ffLuma);
        FfxDofSetIntNearLuma(iFetch, nfLuma);
        FfxDofSetIntermediateFarColor(iFetch, ffColor);
        FfxDofSetIntermediateNearColor(iFetch, nfColor);
    }

    // All fetched values must be visible before the 3x3 filters read their neighbors.
    FFX_GROUP_MEMORY_BARRIER;

    const FfxUInt32 baseIdx = gtID.x + gtID.y * FFX_DOF_COMBINE_ROW_PITCH;
    // one extra round of filtering needs to be done around the edge, this index maps to that.
    // With TILE_SIZE = 8 and ROW_PITCH = 11: gix 0..8 map to column 8, rows 0..8 and
    // gix 9..16 map to row 8, columns 0..7. The negative term relies on unsigned wrap-around.
    // TODO: same as above, ugly and slow.
    const FfxUInt32 baseIdx2 = (FFX_DOF_COMBINE_TILE_SIZE + FFX_DOF_COMBINE_ROW_PITCH * gix + (gix / (FFX_DOF_COMBINE_TILE_SIZE + 1)) * ((gix - FFX_DOF_COMBINE_TILE_SIZE) * (-FFX_DOF_COMBINE_ROW_PITCH + 1) - (FFX_DOF_COMBINE_TILE_SIZE + 1))) % FFX_DOF_COMBINE_AREA;

    FfxHalfOpt4 ffColor = FfxHalfOpt4(0, 0, 0, 0), ffColor2 = FfxHalfOpt4(0, 0, 0, 0), nfColor = FfxHalfOpt4(0, 0, 0, 0), nfColor2 = FfxHalfOpt4(0, 0, 0, 0);

    // far-field post-filter
    ffColor = FfxDofFilterFF(baseIdx);
    ffColor2 = gix < (2 * FFX_DOF_COMBINE_TILE_SIZE + 1) ? FfxDofFilterFF(baseIdx2) : FfxHalfOpt4(0, 0, 0, 0);
    // near-field post-filter
    nfColor = FfxDofFilterNF(baseIdx);
    nfColor2 = gix < (2 * FFX_DOF_COMBINE_TILE_SIZE + 1) ? FfxDofFilterNF(baseIdx2) : FfxHalfOpt4(0, 0, 0, 0);

    // Every thread must finish reading unfiltered values before they are overwritten.
    FFX_GROUP_MEMORY_BARRIER;

    // write out colors for interpolation
    // (the result for the window starting at baseIdx is stored at baseIdx itself)
    FfxDofSetIntermediateNearColor(baseIdx, nfColor);
    FfxDofSetIntermediateFarColor(baseIdx, ffColor);
    if (gix < (2 * FFX_DOF_COMBINE_TILE_SIZE + 1))
    {
        FfxDofSetIntermediateNearColor(baseIdx2, nfColor2);
        FfxDofSetIntermediateFarColor(baseIdx2, ffColor2);
    }

    // Filtered values must be visible before neighbors are read for upscaling.
    FFX_GROUP_MEMORY_BARRIER;

    // if any FG sample has zero weight, the interpolation is invalid.
    // take the min and invalidate if zero (see CombineColors)
    FfxHalfOpt fgMinW = min(nfColor.a, FfxHalfOpt(ffxMin3(FfxDofGetIntermediateNearAlpha(baseIdx, 1, 0), FfxDofGetIntermediateNearAlpha(baseIdx, 0, 1), FfxDofGetIntermediateNearAlpha(baseIdx, 1, 1))));

    // upscaling
    // TR / BL average this pixel with its right / bottom neighbor, BR averages all four.
    FfxHalfOpt4 nfTR = FfxHalfOpt4(0, 0, 0, 0), nfBL = FfxHalfOpt4(0, 0, 0, 0), nfBR = FfxHalfOpt4(0, 0, 0, 0);
    nfTR = FfxHalfOpt(0.5) * nfColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateNearColor(baseIdx + 1);
    nfBL = FfxHalfOpt(0.5) * nfColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateNearColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH);
    nfBR = FfxHalfOpt(0.5) * nfTR + FfxHalfOpt(0.25) * FfxDofGetIntermediateNearColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH) + FfxHalfOpt(0.25) * FfxDofGetIntermediateNearColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH + 1);
    FfxHalfOpt4 ffTR = FfxHalfOpt4(0, 0, 0, 0), ffBL = FfxHalfOpt4(0, 0, 0, 0), ffBR = FfxHalfOpt4(0, 0, 0, 0);
    ffTR = FfxHalfOpt(0.5) * ffColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateFarColor(baseIdx + 1);
    ffBL = FfxHalfOpt(0.5) * ffColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH);
    ffBR = FfxHalfOpt(0.5) * ffTR + FfxHalfOpt(0.25) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH) + FfxHalfOpt(0.25) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH + 1);

    // top-left pixel
    FfxUInt32x2 coord = 2 * id;
    FfxUInt32x2 relCoord = 2 * gtID;
    FfxUInt32x2 coordA = coord;
    FfxHalfOpt4 colA = FfxDofFinalCombineColors(coord, relCoord, ffColor, nfColor, fgMinW);
    // top-right
    coord.x += 1;
    relCoord.x += 1;
    FfxUInt32x2 coordB = coord;
    FfxHalfOpt4 colB = FfxDofFinalCombineColors(coord, relCoord, ffTR, nfTR, fgMinW);
    // bottom-right
    coord.y++;
    relCoord.y++;
    FfxUInt32x2 coordC = coord;
    FfxHalfOpt4 colC = FfxDofFinalCombineColors(coord, relCoord, ffBR, nfBR, fgMinW);
    // bottom-left
    coord.x--;
    relCoord.x--;
    FfxUInt32x2 coordD = coord;
    FfxHalfOpt4 colD = FfxDofFinalCombineColors(coord, relCoord, ffBL, nfBL, fgMinW);

    // Colors and coordinates go through the same swizzle, so each color stays paired with its coordinate.
    FfxDofSwizQuad(colA, colB, colC, colD);
    FfxDofSwizQuad(coordA, coordB, coordC, coordD);

    FfxDofStoreOutput(coordA, colA);
    FfxDofStoreOutput(coordB, colB);
    FfxDofStoreOutput(coordC, colC);
    FfxDofStoreOutput(coordD, colD);
}

/// Entry point. Meant to run in 8x8 threads and writes 16x16 output pixels.
///
/// The tile is classified from its dilated radius and dispatched to one of three
/// paths: a plain copy (all sharp), far-field only, or near and far field combined.
///
/// @param threadID SV_DispatchThreadID.xy
/// @param groupThreadID SV_GroupThreadID.xy
/// @param group SV_GroupID.xy
/// @param index SV_GroupIndex
/// @param halfImageSize Pixel size of the input (half resolution)
/// @param fullImageSize Pixel size of the output (full resolution)
/// @ingroup FfxGPUDof
void FfxDofCombineHalfRes(FfxUInt32x2 threadID, FfxUInt32x2 groupThreadID, FfxUInt32x2 group, FfxUInt32 index, FfxUInt32x2 halfImageSize, FfxUInt32x2 fullImageSize)
{
    // classify tile
    FfxFloat32x2 tileCoc = FfxDofGetTileRadius(group);
    FfxBoolean nearNeeded = tileCoc.x > -1.025; // halved due to resolution change, then: 2px = threshold in main pass + small inaccuracy bias
    FfxBoolean allSharp = max(abs(tileCoc.x), abs(tileCoc.y)) < 0.25;

    // The classification depends only on `group`, so every thread of a group takes the
    // same branch (the blurred paths contain group barriers).
    if (allSharp)
    {
        FfxDofCombineSharpOnly(group, groupThreadID);
    }
    else if (!nearNeeded)
    {
        FfxDofFetchFullColor(group, index, fullImageSize);
        FfxDofCombineFarOnly(threadID, groupThreadID, group, index, halfImageSize);
    }
    else
    {
        FfxDofFetchFullColor(group, index, fullImageSize);
        FfxDofCombineAll(threadID, groupThreadID, group, index, halfImageSize);
    }
}