// This file is part of the FidelityFX SDK.
//
// Copyright (C) 2024 Advanced Micro Devices, Inc.
// 
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include "ffx_core.h"
#include "ffx_dof_common.h"

#if FFX_HALF
// Half-precision median of nine values.
// The inputs are treated as three triples (a,b,c), (d,e,f), (g,h,i); the result is the
// median of: the largest triple-minimum, the median triple-median and the smallest triple-maximum.
FfxFloat16 FfxMed9(FfxFloat16 a, FfxFloat16 b, FfxFloat16 c,
	FfxFloat16 d, FfxFloat16 e, FfxFloat16 f,
	FfxFloat16 g, FfxFloat16 h, FfxFloat16 i)
{
	// TODO (perf): compiler does not use med3 at all for these

	// largest of the per-triple minima
	const FfxFloat16 maxOfMins = ffxMax3Half(
		ffxMin3Half(a, b, c),
		ffxMin3Half(d, e, f),
		ffxMin3Half(g, h, i));

	// median of the per-triple medians
	const FfxFloat16 medOfMeds = ffxMed3Half(
		ffxMed3Half(a, b, c),
		ffxMed3Half(d, e, f),
		ffxMed3Half(g, h, i));

	// smallest of the per-triple maxima
	const FfxFloat16 minOfMaxes = ffxMin3Half(
		ffxMax3Half(a, b, c),
		ffxMax3Half(d, e, f),
		ffxMax3Half(g, h, i));

	return ffxMed3Half(maxOfMins, medOfMeds, minOfMaxes);
}
#endif

// Single-precision median of nine values.
// The inputs are treated as three triples (a,b,c), (d,e,f), (g,h,i); the result is the
// median of: the largest triple-minimum, the median triple-median and the smallest triple-maximum.
FfxFloat32 FfxMed9(FfxFloat32 a, FfxFloat32 b, FfxFloat32 c,
	FfxFloat32 d, FfxFloat32 e, FfxFloat32 f,
	FfxFloat32 g, FfxFloat32 h, FfxFloat32 i)
{
	// TODO (perf): compiler does not use med3 at all for these

	// largest of the per-triple minima
	const FfxFloat32 maxOfMins = ffxMax3(
		ffxMin3(a, b, c),
		ffxMin3(d, e, f),
		ffxMin3(g, h, i));

	// median of the per-triple medians
	const FfxFloat32 medOfMeds = ffxMed3(
		ffxMed3(a, b, c),
		ffxMed3(d, e, f),
		ffxMed3(g, h, i));

	// smallest of the per-triple maxima
	const FfxFloat32 minOfMaxes = ffxMin3(
		ffxMax3(a, b, c),
		ffxMax3(d, e, f),
		ffxMax3(g, h, i));

	return ffxMed3(maxOfMins, medOfMeds, minOfMaxes);
}

// Evaluates the cubic 3x^2 - 2x^3 (yields 0 at x = 0 and 1 at x = 1).
// The input is not clamped here.
FfxFloat32 FfxCubicSpline(FfxFloat32 x)
{
	const FfxFloat32 xSquared = x * x;
	return xSquared * (3 - 2 * x);
}

// Edge length of the tile handled per thread group in the intermediate near/far LDS arrays
// (the entry point is documented to run with 8x8 threads).
FFX_STATIC const FfxUInt32 FFX_DOF_COMBINE_TILE_SIZE = 8;
// Number of entries per row of the intermediate LDS tile, including the filter margins.
FFX_STATIC const FfxUInt32 FFX_DOF_COMBINE_ROW_PITCH = FFX_DOF_COMBINE_TILE_SIZE + 3; // Add +2 for 3x3 filter margin, +1 on one side for bilinear filter.
// Total number of entries in one intermediate LDS tile (ROW_PITCH x ROW_PITCH).
FFX_STATIC const FfxUInt32 FFX_DOF_COMBINE_AREA = FFX_DOF_COMBINE_ROW_PITCH * FFX_DOF_COMBINE_ROW_PITCH;

// Group-shared scratch storage for the combine pass.
// The intermediate arrays hold FFX_DOF_COMBINE_AREA entries each; the full-color arrays hold an
// 18x18 tile (the fetch code fills a 16x16 region plus a one-texel border on each side).
#if FFX_HALF
// Half-precision layout: two 16-bit values are packed into each 32-bit word.
// Near luma is stored in .x and far luma in .y of the unpacked pair.
FFX_GROUPSHARED FfxUInt32  FfxDofLDSLuma[FFX_DOF_COMBINE_AREA];
// Near-field color, split into packed RG and BA words.
FFX_GROUPSHARED FfxUInt32  FfxDofLDSNearRG[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxUInt32  FfxDofLDSNearBA[FFX_DOF_COMBINE_AREA];
// Far-field color, split into packed RG and BA words.
FFX_GROUPSHARED FfxUInt32  FfxDofLDSFarRG[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxUInt32  FfxDofLDSFarBA[FFX_DOF_COMBINE_AREA];

// Full-resolution input color: packed RG word plus a separate half for B.
FFX_GROUPSHARED FfxUInt32  FfxDofLDSFullColorRG[18*18];
FFX_GROUPSHARED FfxFloat16 FfxDofLDSFullColorB[18*18];
#else // #if FFX_HALF
// Single-precision layout: one array per channel.
FFX_GROUPSHARED FfxFloat32  FfxDofLDSNearLuma[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32  FfxDofLDSFarLuma[FFX_DOF_COMBINE_AREA];
// Near-field color channels.
FFX_GROUPSHARED FfxFloat32  FfxDofLDSNearR[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32  FfxDofLDSNearG[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32  FfxDofLDSNearB[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32  FfxDofLDSNearA[FFX_DOF_COMBINE_AREA];

// Far-field color channels.
FFX_GROUPSHARED FfxFloat32  FfxDofLDSFarR[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32  FfxDofLDSFarG[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32  FfxDofLDSFarB[FFX_DOF_COMBINE_AREA];
FFX_GROUPSHARED FfxFloat32  FfxDofLDSFarA[FFX_DOF_COMBINE_AREA];

// Full-resolution input color channels.
FFX_GROUPSHARED FfxFloat32  FfxDofLDSFullColorR[18*18];
FFX_GROUPSHARED FfxFloat32  FfxDofLDSFullColorG[18*18];
FFX_GROUPSHARED FfxFloat32  FfxDofLDSFullColorB[18*18];
#endif // #if FFX_HALF #else

#if FFX_HALF
// Reads the near-field RGBA color at LDS entry idx by unpacking its RG and BA words.
FfxFloat16x4 FfxDofGetIntermediateNearColor(FfxUInt32 idx)
{
	return FfxFloat16x4(
		FFX_UINT32_TO_FLOAT16X2(FfxDofLDSNearRG[idx]),
		FFX_UINT32_TO_FLOAT16X2(FfxDofLDSNearBA[idx]));
}
// Reads the far-field RGBA color at LDS entry idx by unpacking its RG and BA words.
FfxFloat16x4 FfxDofGetIntermediateFarColor(FfxUInt32 idx)
{
	return FfxFloat16x4(
		FFX_UINT32_TO_FLOAT16X2(FfxDofLDSFarRG[idx]),
		FFX_UINT32_TO_FLOAT16X2(FfxDofLDSFarBA[idx]));
}
// Reads the full-resolution RGB color at LDS entry idx (packed RG word + separate B half).
FfxFloat16x3 FfxDofGetIntFullColor(FfxUInt32 idx)
{
	return FfxFloat16x3(
		FFX_UINT32_TO_FLOAT16X2(FfxDofLDSFullColorRG[idx]),
		FfxDofLDSFullColorB[idx]);
}
// Stores the near-field luma in the .x half of the packed luma word at idx.
// The .y half (far luma) currently in LDS is read back and preserved.
void FfxDofSetIntNearLuma(FfxUInt32 idx, FfxFloat16 luma)
{
	const FfxFloat16x2 previous = FFX_UINT32_TO_FLOAT16X2(FfxDofLDSLuma[idx]);
	FfxDofLDSLuma[idx] = FFX_FLOAT16X2_TO_UINT32(FfxFloat16x2(luma, previous.y));
}
// Stores the far-field luma in the .y half of the packed luma word at idx.
// The .x half (near luma) currently in LDS is read back and preserved.
void FfxDofSetIntFarLuma(FfxUInt32 idx, FfxFloat16 luma)
{
	const FfxFloat16x2 previous = FFX_UINT32_TO_FLOAT16X2(FfxDofLDSLuma[idx]);
	FfxDofLDSLuma[idx] = FFX_FLOAT16X2_TO_UINT32(FfxFloat16x2(previous.x, luma));
}

// Writes the near-field RGBA color to LDS entry idx as two packed half2 words.
void FfxDofSetIntermediateNearColor(FfxUInt32 idx, FfxFloat16x4 col)
{
	const FfxUInt32 packedRG = FFX_FLOAT16X2_TO_UINT32(col.xy);
	const FfxUInt32 packedBA = FFX_FLOAT16X2_TO_UINT32(col.zw);
	FfxDofLDSNearRG[idx] = packedRG;
	FfxDofLDSNearBA[idx] = packedBA;
}
// Writes the far-field RGBA color to LDS entry idx as two packed half2 words.
void FfxDofSetIntermediateFarColor(FfxUInt32 idx, FfxFloat16x4 col)
{
	const FfxUInt32 packedRG = FFX_FLOAT16X2_TO_UINT32(col.xy);
	const FfxUInt32 packedBA = FFX_FLOAT16X2_TO_UINT32(col.zw);
	FfxDofLDSFarRG[idx] = packedRG;
	FfxDofLDSFarBA[idx] = packedBA;
}
// Writes the full-resolution RGB color to LDS entry idx (packed RG word + separate B half).
void FfxDofSetIntFullColor(FfxUInt32 idx, FfxFloat16x3 col)
{
	const FfxUInt32 packedRG = FFX_FLOAT16X2_TO_UINT32(col.xy);
	FfxDofLDSFullColorRG[idx] = packedRG;
	FfxDofLDSFullColorB[idx] = col.z;
}
// Returns the alpha of the near-field color located offX columns and offY rows after entry idx.
FfxFloat16 FfxDofGetIntermediateNearAlpha(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
	const FfxUInt32 entry = idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY;
	return FfxDofGetIntermediateNearColor(entry).w;
}
// Returns the alpha of the far-field color stored at LDS entry idx.
FfxFloat16 FfxDofGetIntermediateFarAlpha(FfxUInt32 idx)
{
	const FfxFloat16x4 farColor = FfxDofGetIntermediateFarColor(idx);
	return farColor.w;
}
#else // #if FFX_HALF
// Reads the near-field RGBA color at LDS entry idx from the per-channel arrays.
FfxFloat32x4 FfxDofGetIntermediateNearColor(FfxUInt32 idx)
{
	const FfxFloat32 red = FfxDofLDSNearR[idx];
	const FfxFloat32 green = FfxDofLDSNearG[idx];
	const FfxFloat32 blue = FfxDofLDSNearB[idx];
	const FfxFloat32 alpha = FfxDofLDSNearA[idx];
	return FfxFloat32x4(red, green, blue, alpha);
}
// Reads the far-field RGBA color at LDS entry idx from the per-channel arrays.
FfxFloat32x4 FfxDofGetIntermediateFarColor(FfxUInt32 idx)
{
	const FfxFloat32 red = FfxDofLDSFarR[idx];
	const FfxFloat32 green = FfxDofLDSFarG[idx];
	const FfxFloat32 blue = FfxDofLDSFarB[idx];
	const FfxFloat32 alpha = FfxDofLDSFarA[idx];
	return FfxFloat32x4(red, green, blue, alpha);
}
// Reads the full-resolution RGB color at LDS entry idx from the per-channel arrays.
FfxFloat32x3 FfxDofGetIntFullColor(FfxUInt32 idx)
{
	const FfxFloat32 red = FfxDofLDSFullColorR[idx];
	const FfxFloat32 green = FfxDofLDSFullColorG[idx];
	const FfxFloat32 blue = FfxDofLDSFullColorB[idx];
	return FfxFloat32x3(red, green, blue);
}
// Stores the near-field luma for LDS entry idx.
void FfxDofSetIntNearLuma(FfxUInt32 idx, FfxFloat32 luma)
{
	FfxDofLDSNearLuma[idx] = luma;
}
// Stores the far-field luma for LDS entry idx.
void FfxDofSetIntFarLuma(FfxUInt32 idx, FfxFloat32 luma)
{
	FfxDofLDSFarLuma[idx] = luma;
}

// Writes the near-field RGBA color to LDS entry idx, one channel per array.
void FfxDofSetIntermediateNearColor(FfxUInt32 idx, FfxFloat32x4 col)
{
	FfxDofLDSNearR[idx] = col.x;
	FfxDofLDSNearG[idx] = col.y;
	FfxDofLDSNearB[idx] = col.z;
	FfxDofLDSNearA[idx] = col.w;
}
// Writes the far-field RGBA color to LDS entry idx, one channel per array.
void FfxDofSetIntermediateFarColor(FfxUInt32 idx, FfxFloat32x4 col)
{
	FfxDofLDSFarR[idx] = col.x;
	FfxDofLDSFarG[idx] = col.y;
	FfxDofLDSFarB[idx] = col.z;
	FfxDofLDSFarA[idx] = col.w;
}
// Writes the full-resolution RGB color to LDS entry idx, one channel per array.
void FfxDofSetIntFullColor(FfxUInt32 idx, FfxFloat32x3 col)
{
	FfxDofLDSFullColorR[idx] = col.x;
	FfxDofLDSFullColorG[idx] = col.y;
	FfxDofLDSFullColorB[idx] = col.z;
}
// Returns the alpha of the near-field color located offX columns and offY rows after entry idx.
FfxFloat32 FfxDofGetIntermediateNearAlpha(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
	const FfxUInt32 entry = idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY;
	return FfxDofGetIntermediateNearColor(entry).w;
}
// Returns the alpha of the far-field color stored at LDS entry idx.
FfxFloat32 FfxDofGetIntermediateFarAlpha(FfxUInt32 idx)
{
	const FfxFloat32x4 farColor = FfxDofGetIntermediateFarColor(idx);
	return farColor.w;
}
#endif // #if FFX_HALF #else

// Weighted 3x3 blur over the full-color LDS tile.
// baseIdx addresses the top-left texel of the 3x3 window; the tile has a row pitch of 18 entries.
// Returns the normalized weighted sum of the nine texels.
FfxHalfOpt3 FfxDofBlur3x3(FfxUInt32 baseIdx)
{
	// kernel coefficients based on coverage of a circle in a 3x3 grid
	const FfxHalfOpt CORNER = FfxHalfOpt(0.5453);
	const FfxHalfOpt SIDE = FfxHalfOpt(0.9717);
	const FfxHalfOpt weights_sum = FfxHalfOpt(1) + FfxHalfOpt(4) * CORNER + FfxHalfOpt(4) * SIDE;

	// start indices of the three window rows
	const FfxUInt32 PITCH = 18;
	const FfxUInt32 top = baseIdx;
	const FfxUInt32 middle = baseIdx + PITCH;
	const FfxUInt32 bottom = baseIdx + 2 * PITCH;

	const FfxHalfOpt3 center = FfxDofGetIntFullColor(middle + 1);
	const FfxHalfOpt3 corners =
		FfxDofGetIntFullColor(top) + FfxDofGetIntFullColor(top + 2) +
		FfxDofGetIntFullColor(bottom) + FfxDofGetIntFullColor(bottom + 2);
	const FfxHalfOpt3 sides =
		FfxDofGetIntFullColor(top + 1) + FfxDofGetIntFullColor(middle) +
		FfxDofGetIntFullColor(middle + 2) + FfxDofGetIntFullColor(bottom + 1);

	// accumulate convolution
	const FfxHalfOpt3 sum = center + CORNER * corners + SIDE * sides;
	return sum / weights_sum;
}

// Produces the final color for one full-resolution pixel.
// coord    - pixel coordinate used to load the full-resolution depth
// relCoord - pixel coordinate relative to the full-color LDS tile (row pitch 18)
// bg       - background (far-field) color; rgb is divided by alpha when alpha is positive
// fg       - foreground (near-field) color; its alpha drives the final blend
// minFgW   - minimum foreground weight; a value of zero disables the foreground blend
// Returns the combined color with alpha set to 1.
FfxHalfOpt4 FfxDofFinalCombineColors(FfxUInt32x2 coord, FfxUInt32x2 relCoord, FfxHalfOpt4 bg, FfxHalfOpt4 fg, FfxHalfOpt minFgW)
{
	// ldsIdx is the top-left texel of the 3x3 window; +19 (one row and one column) is its center.
	const FfxUInt32 ldsIdx = relCoord.x + 18 * relCoord.y;
	const FfxHalfOpt3 sharpColor = FfxHalfOpt3(FfxDofGetIntFullColor(ldsIdx + 19));
	const FfxHalfOpt3 softColor = FfxHalfOpt3(FfxDofBlur3x3(ldsIdx));

	// expand background around edges
	FfxHalfOpt3 bgColor = bg.rgb;
	if (bg.a > FfxHalfOpt(0))
	{
		bgColor = bgColor / bg.a;
	}

	// if any FG sample has zero weight, the interpolation is invalid.
	const FfxHalfOpt fgAlpha = (minFgW == FfxHalfOpt(0)) ? FfxHalfOpt(0) : fg.a;

	const FfxFloat32 depth = FfxDofLoadFullDepth(coord);
	// double it for full-res pixels
	const FfxHalfOpt coc = FfxHalfOpt(2) * FfxHalfOpt(abs(FfxDofGetCoc(depth)));
	// lerp factor for full vs. fixed 1.5px blur
	const FfxHalfOpt sharpToSoft = ffxSaturate(coc - FfxHalfOpt(0.5));
	// lerp factor for prev vs. quarter res; forced to zero when the background has no weight
	const FfxHalfOpt toBackground = (bg.a == FfxHalfOpt(0)) ? FfxHalfOpt(0) : ffxSaturate(coc - FfxHalfOpt(1.5));

	FfxHalfOpt3 result = ffxLerp(sharpColor, softColor, sharpToSoft);
	result = ffxLerp(result, bgColor, toBackground);
	result = ffxLerp(result, fg.rgb, FfxHalfOpt(FfxCubicSpline(fgAlpha)));
	return FfxHalfOpt4(result, 1);
}

#if FFX_HALF
// Returns the near-field luma (.x of the packed pair) offX columns and offY rows after entry idx.
FfxFloat16 FfxDofGetLDSNearLuma(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
	const FfxUInt32 entry = idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY;
	const FfxFloat16x2 lumaPair = FFX_UINT32_TO_FLOAT16X2(FfxDofLDSLuma[entry]);
	return lumaPair.x;
}
// Returns the far-field luma (.y of the packed pair) offX columns and offY rows after entry idx.
FfxFloat16 FfxDofGetLDSFarLuma(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
	const FfxUInt32 entry = idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY;
	const FfxFloat16x2 lumaPair = FFX_UINT32_TO_FLOAT16X2(FfxDofLDSLuma[entry]);
	return lumaPair.y;
}
#else // #if FFX_HALF
// Returns the near-field luma offX columns and offY rows after entry idx.
FfxFloat32 FfxDofGetLDSNearLuma(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
	const FfxUInt32 entry = idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY;
	return FfxDofLDSNearLuma[entry];
}
// Returns the far-field luma offX columns and offY rows after entry idx.
FfxFloat32 FfxDofGetLDSFarLuma(FfxUInt32 idx, FfxUInt32 offX, FfxUInt32 offY)
{
	const FfxUInt32 entry = idx + offX + FFX_DOF_COMBINE_ROW_PITCH * offY;
	return FfxDofLDSFarLuma[entry];
}
#endif // #if FFX_HALF #else

// Far-field post-filter for the 3x3 window whose top-left LDS entry is baseIdx.
// Scales the center color by (median luma / center luma), clamped to [0, 2].
// Returns the scaled RGB together with the center's unmodified far-field alpha.
FfxHalfOpt4 FfxDofFilterFF(FfxUInt32 baseIdx)
{
	// get the median of the surrounding 3x3 area of luma values
	const FfxHalfOpt medianLuma = FfxMed9(
		FfxDofGetLDSFarLuma(baseIdx, 0, 0), FfxDofGetLDSFarLuma(baseIdx, 1, 0), FfxDofGetLDSFarLuma(baseIdx, 2, 0),
		FfxDofGetLDSFarLuma(baseIdx, 0, 1), FfxDofGetLDSFarLuma(baseIdx, 1, 1), FfxDofGetLDSFarLuma(baseIdx, 2, 1),
		FfxDofGetLDSFarLuma(baseIdx, 0, 2), FfxDofGetLDSFarLuma(baseIdx, 1, 2), FfxDofGetLDSFarLuma(baseIdx, 2, 2));

	// center of the window: one row down, one column right
	const FfxUInt32 center = baseIdx + FFX_DOF_COMBINE_ROW_PITCH + 1;
	const FfxHalfOpt3 centerColor = FfxDofGetIntermediateFarColor(center).rgb;

	// corner fix: if color pixel is on a corner (has 5 black pixels as neighbor), don't reduce color.
	const FfxHalfOpt lumaScale = (medianLuma == FfxHalfOpt(0))
		? FfxHalfOpt(1)
		: clamp(medianLuma / FfxDofGetLDSFarLuma(center, 0, 0), FfxHalfOpt(0), FfxHalfOpt(2));

	return FfxHalfOpt4(centerColor * lumaScale, FfxDofGetIntermediateFarAlpha(center));
}

// Near-field post-filter for the 3x3 window whose top-left LDS entry is baseIdx.
// Picks the center color (or, when the center alpha is below 0.01, the brightest corner),
// scales it by (median luma / luma of the picked texel) clamped to [0, 2], and returns it
// together with the average alpha of the nine texels.
FfxHalfOpt4 FfxDofFilterNF(FfxUInt32 baseIdx)
{
	const FfxUInt32 PITCH = FFX_DOF_COMBINE_ROW_PITCH;

	// luma of the four corners and the center, reused below
	const FfxHalfOpt lumaTL = FfxDofGetLDSNearLuma(baseIdx, 0, 0);
	const FfxHalfOpt lumaTR = FfxDofGetLDSNearLuma(baseIdx, 2, 0);
	const FfxHalfOpt lumaC = FfxDofGetLDSNearLuma(baseIdx, 1, 1);
	const FfxHalfOpt lumaBL = FfxDofGetLDSNearLuma(baseIdx, 0, 2);
	const FfxHalfOpt lumaBR = FfxDofGetLDSNearLuma(baseIdx, 2, 2);

	// Get 3x3 median luma
	const FfxHalfOpt medianLuma = FfxMed9(
		lumaTL, FfxDofGetLDSNearLuma(baseIdx, 1, 0), lumaTR,
		FfxDofGetLDSNearLuma(baseIdx, 0, 1), lumaC, FfxDofGetLDSNearLuma(baseIdx, 2, 1),
		lumaBL, FfxDofGetLDSNearLuma(baseIdx, 1, 2), lumaBR);

	// mean alpha over the window
	const FfxHalfOpt avgAlpha = FfxHalfOpt(ffxReciprocal(9.0)) * (
		FfxDofGetIntermediateNearAlpha(baseIdx, 0, 0) + FfxDofGetIntermediateNearAlpha(baseIdx, 1, 0) + FfxDofGetIntermediateNearAlpha(baseIdx, 2, 0) +
		FfxDofGetIntermediateNearAlpha(baseIdx, 0, 1) + FfxDofGetIntermediateNearAlpha(baseIdx, 1, 1) + FfxDofGetIntermediateNearAlpha(baseIdx, 2, 1) +
		FfxDofGetIntermediateNearAlpha(baseIdx, 0, 2) + FfxDofGetIntermediateNearAlpha(baseIdx, 1, 2) + FfxDofGetIntermediateNearAlpha(baseIdx, 2, 2));

	// default: use the center texel
	FfxUInt32 pickIdx = baseIdx + PITCH + 1;
	FfxHalfOpt pickLuma = lumaC;
	if (FfxDofGetIntermediateNearAlpha(pickIdx, 0, 0) < 0.01)
	{
		// center has zero weight, grab one of the corner colors
		// (the strictly brightest wins; ties keep the earlier corner)
		pickIdx = baseIdx;
		pickLuma = lumaTL;
		if (lumaTR > pickLuma)
		{
			pickIdx = baseIdx + 2;
			pickLuma = lumaTR;
		}
		if (lumaBL > pickLuma)
		{
			pickIdx = baseIdx + 2 * PITCH;
			pickLuma = lumaBL;
		}
		if (lumaBR > pickLuma)
		{
			pickIdx = baseIdx + 2 * PITCH + 2;
			pickLuma = lumaBR;
		}
	}

	const FfxHalfOpt3 pickColor = FfxDofGetIntermediateNearColor(pickIdx).rgb;
	const FfxHalfOpt lumaScale = medianLuma > FfxHalfOpt(0)
		? clamp(medianLuma / pickLuma, FfxHalfOpt(0), FfxHalfOpt(2))
		: FfxHalfOpt(1.0);
	return FfxHalfOpt4(pickColor.rgb * lumaScale, avgAlpha);
}

// Combines the dilated radii of the 2x2 block of tiles starting at (group * 2).
// Returns (maximum of the four .x values, minimum of the four .y values).
FfxFloat32x2 FfxDofGetTileRadius(FfxUInt32x2 group)
{
	// need to read 4 values
	const FfxUInt32x2 origin = group * 2;
	const FfxFloat32x2 r00 = FfxDofLoadDilatedRadius(origin);
	const FfxFloat32x2 r01 = FfxDofLoadDilatedRadius(origin + FfxUInt32x2(0, 1));
	const FfxFloat32x2 r10 = FfxDofLoadDilatedRadius(origin + FfxUInt32x2(1, 0));
	const FfxFloat32x2 r11 = FfxDofLoadDilatedRadius(origin + FfxUInt32x2(1, 1));

	const FfxFloat32 largestX = max(r00.x, ffxMax3(r01.x, r10.x, r11.x));
	const FfxFloat32 smallestY = min(r00.y, ffxMin3(r01.y, r10.y, r11.y));
	return FfxFloat32x2(largestX, smallestY);
}

// Copies the full-resolution input to the output unchanged for a 16x16 tile.
// Each thread copies four pixels spaced 8 apart in x and y.
// Compiles to nothing when FFX_DOF_OPTION_COMBINE_IN_PLACE is defined and non-zero.
void FfxDofCombineSharpOnly(FfxUInt32x2 group, FfxUInt32x2 thread)
{
#if !defined(FFX_DOF_OPTION_COMBINE_IN_PLACE) || !FFX_DOF_OPTION_COMBINE_IN_PLACE
	const FfxUInt32x2 pixel00 = 16 * group + thread;
	const FfxUInt32x2 pixel80 = pixel00 + FfxUInt32x2(8, 0);
	const FfxUInt32x2 pixel08 = pixel00 + FfxUInt32x2(0, 8);
	const FfxUInt32x2 pixel88 = pixel00 + FfxUInt32x2(8, 8);

	FfxDofStoreOutput(pixel00, FfxDofLoadFullInput(pixel00));
	FfxDofStoreOutput(pixel80, FfxDofLoadFullInput(pixel80));
	FfxDofStoreOutput(pixel08, FfxDofLoadFullInput(pixel08));
	FfxDofStoreOutput(pixel88, FfxDofLoadFullInput(pixel88));
#endif
}

// Loads the 18x18 full-resolution color tile for group gid into LDS.
// The tile covers the group's 16x16 pixels plus a one-pixel border; coordinates are clamped
// to [0, imageSize - 1]. gix is the flat thread index; each thread fetches six entries,
// 64 apart, wrapping modulo the tile size (so some entries are fetched more than once).
void FfxDofFetchFullColor(FfxUInt32x2 gid, FfxUInt32 gix, FfxUInt32x2 imageSize)
{
	const FfxInt32x2 tileOrigin = FfxInt32x2(gid * 16) - FfxInt32x2(1, 1);
	const FfxInt32x2 lastPixel = FfxInt32x2(imageSize) - FfxInt32x2(1, 1);

	FFX_DOF_UNROLL
	for (FfxUInt32 pass = 0; pass < 6; pass++)
	{
		const FfxUInt32 slot = (gix + pass * 64) % (18 * 18);
		const FfxInt32x2 inTile = FfxInt32x2(slot % 18, slot / 18);
		const FfxInt32x2 pixel = clamp(tileOrigin + inTile, FfxInt32x2(0, 0), lastPixel);
		FfxDofSetIntFullColor(slot, FfxHalfOpt3(FfxDofLoadFullInput(pixel).rgb));
	}
}

// Permutes four per-lane values (a, b, c, d) across the wave and between the four registers.
// The same permutation must be applied to every set of values that belongs together
// (callers in this file swizzle colors and their output coordinates with matching calls).
// Returns without modifying anything when the wave/subgroup has fewer than 32 lanes.
// NOTE(review): `lane` is only declared inside the FFX_HLSL / FFX_GLSL branches, so this
// function presumably requires one of those to be set — confirm against ffx_core.h.
void FfxDofSwizQuad(inout FfxUInt32x2 a, inout FfxUInt32x2 b, inout FfxUInt32x2 c, inout FfxUInt32x2 d)
{
	// Input: four color values in a quad.
	// Re-orders the output to a swizzled format for better store throughput.
	// This maps from one quad per lane, stored in four separate registers to
	// four 16x2 regions (one per register).
	// This is done in two steps. First, permute the values among the lanes
	// using WaveReadLaneAt. Second, swap values between registers.

	// This only works for lane counts >= 32, do nothing otherwise for compatibility
#if FFX_HLSL
	if (WaveGetLaneCount() < 32) return;

	FfxUInt32 lane = WaveGetLaneIndex();
	// index for A, switch bits around 43210 -> 10432.
	FfxUInt32 idxA = ((lane & 3) << 3) + (lane >> 2);
	// Adding 8/16/24 for B/C/D makes each variable offset from the previous by one slot.
	// (lane & ~31) keeps the source lane within the same group of 32 lanes as the reader.
	a = WaveReadLaneAt(a, (lane & ~31) + (idxA + 0) % 32);
	b = WaveReadLaneAt(b, (lane & ~31) + (idxA + 8) % 32);
	c = WaveReadLaneAt(c, (lane & ~31) + (idxA + 16) % 32);
	d = WaveReadLaneAt(d, (lane & ~31) + (idxA + 24) % 32);
#elif FFX_GLSL
	if (gl_SubgroupSize < 32) return;

	// Same lane permutation as the HLSL branch, expressed with subgroupShuffle.
	FfxUInt32 lane = gl_SubgroupInvocationID;
	FfxUInt32 idxA = ((lane & 3) << 3) + (lane >> 2);
	a = subgroupShuffle(a, (lane & ~31) + (idxA + 0) % 32);
	b = subgroupShuffle(b, (lane & ~31) + (idxA + 8) % 32);
	c = subgroupShuffle(c, (lane & ~31) + (idxA + 16) % 32);
	d = subgroupShuffle(d, (lane & ~31) + (idxA + 24) % 32);
#endif

	// Now, for each lane, a/b/c/d contain one value from each of the four 16x2 lines.
	// And each group of 4 lanes have values from the same quads.
	// We just need to shuffle between abcd, so that each set of 4 lanes contains one quad per variable.
	// General idea: rotate by (lane % 4) variables.
	// The rotation is split into a rotate-by-one (bit 0) followed by a rotate-by-two (bit 1).
	if ((lane & 1) != 0)
	{
		// rotate A->B->C->D->A
		FfxUInt32x2 tmp = d;
		d = c;
		c = b;
		b = a;
		a = tmp;
	}
	if ((lane & 2) != 0)
	{
		// swap A<->C and B<->D
		FfxUInt32x2 tmp = a;
		a = c;
		c = tmp;
		tmp = b;
		b = d;
		d = tmp;
	}
}

#if FFX_HALF
// Half4 overload: bit-packs each value into a uint2, applies the uint2 swizzle, then unpacks.
void FfxDofSwizQuad(inout FfxFloat16x4 a, inout FfxFloat16x4 b, inout FfxFloat16x4 c, inout FfxFloat16x4 d)
{
	FfxUInt32x2 bitsA = FFX_FLOAT16X4_TO_UINT32X2(a);
	FfxUInt32x2 bitsB = FFX_FLOAT16X4_TO_UINT32X2(b);
	FfxUInt32x2 bitsC = FFX_FLOAT16X4_TO_UINT32X2(c);
	FfxUInt32x2 bitsD = FFX_FLOAT16X4_TO_UINT32X2(d);

	// Same permutation as the FfxUInt32x2 overload.
	FfxDofSwizQuad(bitsA, bitsB, bitsC, bitsD);

	a = FFX_UINT32X2_TO_FLOAT16X4(bitsA);
	b = FFX_UINT32X2_TO_FLOAT16X4(bitsB);
	c = FFX_UINT32X2_TO_FLOAT16X4(bitsC);
	d = FFX_UINT32X2_TO_FLOAT16X4(bitsD);
}
#else // #if FFX_HALF
// Float4 overload: reinterprets each value as two uint2 halves (xy and zw), applies the
// uint2 swizzle to each half separately, then reassembles the float4 values.
void FfxDofSwizQuad(inout FfxFloat32x4 a, inout FfxFloat32x4 b, inout FfxFloat32x4 c, inout FfxFloat32x4 d)
{
	// xy halves
	FfxUInt32x2 loA = ffxAsUInt32(a.xy);
	FfxUInt32x2 loB = ffxAsUInt32(b.xy);
	FfxUInt32x2 loC = ffxAsUInt32(c.xy);
	FfxUInt32x2 loD = ffxAsUInt32(d.xy);
	// zw halves
	FfxUInt32x2 hiA = ffxAsUInt32(a.zw);
	FfxUInt32x2 hiB = ffxAsUInt32(b.zw);
	FfxUInt32x2 hiC = ffxAsUInt32(c.zw);
	FfxUInt32x2 hiD = ffxAsUInt32(d.zw);

	// Same permutation as the FfxUInt32x2 overload, once per half.
	FfxDofSwizQuad(loA, loB, loC, loD);
	FfxDofSwizQuad(hiA, hiB, hiC, hiD);

	a = FfxFloat32x4(ffxAsFloat(loA), ffxAsFloat(hiA));
	b = FfxFloat32x4(ffxAsFloat(loB), ffxAsFloat(hiB));
	c = FfxFloat32x4(ffxAsFloat(loC), ffxAsFloat(hiC));
	d = FfxFloat32x4(ffxAsFloat(loD), ffxAsFloat(hiD));
}
#endif // #if FFX_HALF #else

// Combine path for tiles that only need the far field.
// Loads the far-field tile into LDS, runs the far-field post-filter, upscales the result to a
// 2x2 output quad per thread and blends it with the full-resolution color (which must already
// be in the full-color LDS arrays). No foreground is blended: FfxDofFinalCombineColors is
// called with a zero foreground color and a zero minimum foreground weight.
//
// id        - thread coordinate; the thread writes the quad starting at 2 * id
// gtID      - thread coordinate within the group
// gid       - group coordinate
// gix       - flat thread index within the group
// imageSize - size used to clamp the far-field load coordinates
void FfxDofCombineFarOnly(FfxUInt32x2 id, FfxUInt32x2 gtID, FfxUInt32x2 gid, FfxUInt32 gix, FfxUInt32x2 imageSize)
{
	// Stage 1: fetch the ROW_PITCH x ROW_PITCH far-field tile (tile plus margins) into LDS.
	// TODO: Is this the best configuration for fetching?
	FFX_DOF_UNROLL_N(2)
	for (FfxUInt32 iter = 0; iter < 2; iter++)
	{
		// HACK: with the modulo, we will re-fetch some pixels, but might be better than waiting twice (latency-wise)
		// which is what the compiler would do if it had to possibly branch
		FfxUInt32 iFetch = (gix + iter * 64) % FFX_DOF_COMBINE_AREA;
		// LDS entry (0, 0) corresponds to the pixel one to the left of and above the tile origin.
		FfxInt32x2 coord = FfxInt32x2(gid * FFX_DOF_COMBINE_TILE_SIZE) + FfxInt32x2(iFetch % FFX_DOF_COMBINE_ROW_PITCH - 1, iFetch / FFX_DOF_COMBINE_ROW_PITCH - 1);
		coord = clamp(coord, FfxInt32x2(0, 0), FfxInt32x2(imageSize - 1));
		FfxHalfOpt4 ffColor = FfxHalfOpt4(FfxDofLoadFar(coord));

		// calculate and store luma for later median calculation
		FfxHalfOpt ffLuma = FfxHalfOpt(0.2126) * ffColor.r + FfxHalfOpt(0.7152) * ffColor.g + FfxHalfOpt(0.0722) * ffColor.b;
		FfxDofSetIntFarLuma(iFetch, ffLuma);
		FfxDofSetIntermediateFarColor(iFetch, ffColor);
	}

	// Make the fetched tile visible to all threads before filtering.
	FFX_GROUP_MEMORY_BARRIER;

	// Top-left LDS entry of this thread's 3x3 filter window.
	const FfxUInt32 baseIdx = gtID.x + gtID.y * FFX_DOF_COMBINE_ROW_PITCH;
	// one extra round of filtering needs to be done around the edge, this index maps to that.
	// For gix 0..8 this evaluates to column 8, rows 0..8 (8 + ROW_PITCH * gix); for gix 9..16 it
	// evaluates to row 8, columns 0..7 (gix + 79). The intermediate terms rely on unsigned
	// wraparound. Values for larger gix are not used (see the gix guards below).
	// TODO: This is ugly and possibly slow
	const FfxUInt32 baseIdx2 = (FFX_DOF_COMBINE_TILE_SIZE + FFX_DOF_COMBINE_ROW_PITCH * gix + (gix / (FFX_DOF_COMBINE_TILE_SIZE + 1)) * ((gix - FFX_DOF_COMBINE_TILE_SIZE) * (-FFX_DOF_COMBINE_ROW_PITCH + 1) - (FFX_DOF_COMBINE_TILE_SIZE + 1))) % FFX_DOF_COMBINE_AREA;

	FfxHalfOpt4 ffColor = FfxHalfOpt4(0, 0, 0, 0), ffColor2 = FfxHalfOpt4(0, 0, 0, 0);
	// far-field post-filter
	ffColor = FfxDofFilterFF(baseIdx);
	ffColor2 = gix < (2 * FFX_DOF_COMBINE_TILE_SIZE + 1) ? FfxDofFilterFF(baseIdx2) : FfxHalfOpt4(0, 0, 0, 0);

	// All filter reads must finish before the filtered colors overwrite the same LDS arrays.
	FFX_GROUP_MEMORY_BARRIER;

	// write out colors for interpolation
	// Note: the filtered color of window baseIdx is stored at baseIdx itself (the window's top-left entry).
	FfxDofSetIntermediateFarColor(baseIdx, ffColor);
	if (gix < (2 * FFX_DOF_COMBINE_TILE_SIZE + 1))
	{
		FfxDofSetIntermediateFarColor(baseIdx2, ffColor2);
	}

	// Make the filtered colors visible before neighbours are read for upscaling.
	FFX_GROUP_MEMORY_BARRIER;

	// upscaling
	// TR/BL are the average with the right/lower neighbour; BR is the average of all four
	// (this, right, lower, lower-right).
	FfxHalfOpt4 ffTR = FfxHalfOpt4(0, 0, 0, 0), ffBL = FfxHalfOpt4(0, 0, 0, 0), ffBR = FfxHalfOpt4(0, 0, 0, 0);
	ffTR = FfxHalfOpt(0.5) * ffColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateFarColor(baseIdx + 1);
	ffBL = FfxHalfOpt(0.5) * ffColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH);
	ffBR = FfxHalfOpt(0.5) * ffTR + FfxHalfOpt(0.25) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH) + FfxHalfOpt(0.25) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH + 1);

	// Walk the 2x2 output quad clockwise: A = top-left, B = top-right, C = bottom-right, D = bottom-left.
	// top-left pixel
	FfxUInt32x2 coord = 2 * id;
	FfxUInt32x2 relCoord = 2 * gtID;
	FfxUInt32x2 coordA = coord;
	FfxHalfOpt4 colA = FfxDofFinalCombineColors(coord, relCoord, ffColor, FfxHalfOpt4(0, 0, 0, 0), FfxHalfOpt(0));
	// top-right
	coord.x++;
	relCoord.x++;
	FfxUInt32x2 coordB = coord;
	FfxHalfOpt4 colB = FfxDofFinalCombineColors(coord, relCoord, ffTR, FfxHalfOpt4(0, 0, 0, 0), FfxHalfOpt(0));
	// bottom-right
	coord.y++;
	relCoord.y++;
	FfxUInt32x2 coordC = coord;
	FfxHalfOpt4 colC = FfxDofFinalCombineColors(coord, relCoord, ffBR, FfxHalfOpt4(0, 0, 0, 0), FfxHalfOpt(0));
	// bottom-left
	coord.x--;
	relCoord.x--;
	FfxUInt32x2 coordD = coord;
	FfxHalfOpt4 colD = FfxDofFinalCombineColors(coord, relCoord, ffBL, FfxHalfOpt4(0, 0, 0, 0), FfxHalfOpt(0));

	// Colors and coordinates get the same swizzle so each color is still stored to its own pixel.
	// TODO: Navi3 should make swizzling unnecessary because it supports write-combining clauses
	FfxDofSwizQuad(colA, colB, colC, colD);
	FfxDofSwizQuad(coordA, coordB, coordC, coordD);

	FfxDofStoreOutput(coordA, colA);
	FfxDofStoreOutput(coordB, colB);
	FfxDofStoreOutput(coordC, colC);
	FfxDofStoreOutput(coordD, colD);
}

// Combine path for tiles that need both the near and the far field.
// Loads the near- and far-field tiles into LDS, runs both post-filters, upscales the results to
// a 2x2 output quad per thread and blends them with the full-resolution color (which must
// already be in the full-color LDS arrays).
//
// id        - thread coordinate; the thread writes the quad starting at 2 * id
// gtID      - thread coordinate within the group
// gid       - group coordinate
// gix       - flat thread index within the group
// imageSize - size used to clamp the near/far load coordinates
void FfxDofCombineAll(FfxUInt32x2 id, FfxUInt32x2 gtID, FfxUInt32x2 gid, FfxUInt32 gix, FfxUInt32x2 imageSize)
{
	// Stage 1: fetch the ROW_PITCH x ROW_PITCH near and far tiles (tile plus margins) into LDS.
	// TODO: Is this the best configuration for fetching?
	FFX_DOF_UNROLL_N(2)
	for (FfxUInt32 iter = 0; iter < 2; iter++)
	{
		// HACK: with the modulo, we will re-fetch some pixels, but might be better than waiting twice (latency-wise)
		// which is what the compiler would do if it had to possibly branch
		FfxUInt32 iFetch = (gix + iter * 64) % FFX_DOF_COMBINE_AREA;
		// LDS entry (0, 0) corresponds to the pixel one to the left of and above the tile origin.
		FfxInt32x2 coord = FfxInt32x2(gid * FFX_DOF_COMBINE_TILE_SIZE) + FfxInt32x2(iFetch % FFX_DOF_COMBINE_ROW_PITCH - 1, iFetch / FFX_DOF_COMBINE_ROW_PITCH - 1);
		coord = clamp(coord, FfxInt32x2(0, 0), FfxInt32x2(imageSize - 1));
		FfxHalfOpt4 ffColor = FfxHalfOpt4(FfxDofLoadFar(coord));
		FfxHalfOpt4 nfColor = FfxHalfOpt4(FfxDofLoadNear(coord));

		// calculate and store luma for later median calculation
		FfxHalfOpt ffLuma = FfxHalfOpt(0.2126) * ffColor.r + FfxHalfOpt(0.7152) * ffColor.g + FfxHalfOpt(0.0722) * ffColor.b;
		FfxHalfOpt nfLuma = FfxHalfOpt(0.2126) * nfColor.r + FfxHalfOpt(0.7152) * nfColor.g + FfxHalfOpt(0.0722) * nfColor.b;
		FfxDofSetIntFarLuma(iFetch, ffLuma);
		FfxDofSetIntNearLuma(iFetch, nfLuma);
		FfxDofSetIntermediateFarColor(iFetch, ffColor);
		FfxDofSetIntermediateNearColor(iFetch, nfColor);
	}

	// Make the fetched tiles visible to all threads before filtering.
	FFX_GROUP_MEMORY_BARRIER;

	// Top-left LDS entry of this thread's 3x3 filter window.
	const FfxUInt32 baseIdx = gtID.x + gtID.y * FFX_DOF_COMBINE_ROW_PITCH;
	// one extra round of filtering needs to be done around the edge, this index maps to that.
	// For gix 0..8 this evaluates to column 8, rows 0..8 (8 + ROW_PITCH * gix); for gix 9..16 it
	// evaluates to row 8, columns 0..7 (gix + 79). The intermediate terms rely on unsigned
	// wraparound. Values for larger gix are not used (see the gix guards below).
	// TODO: same as above, ugly and slow.
	const FfxUInt32 baseIdx2 = (FFX_DOF_COMBINE_TILE_SIZE + FFX_DOF_COMBINE_ROW_PITCH * gix + (gix / (FFX_DOF_COMBINE_TILE_SIZE + 1)) * ((gix - FFX_DOF_COMBINE_TILE_SIZE) * (-FFX_DOF_COMBINE_ROW_PITCH + 1) - (FFX_DOF_COMBINE_TILE_SIZE + 1))) % FFX_DOF_COMBINE_AREA;

	FfxHalfOpt4 ffColor = FfxHalfOpt4(0, 0, 0, 0), ffColor2 = FfxHalfOpt4(0, 0, 0, 0), nfColor = FfxHalfOpt4(0, 0, 0, 0), nfColor2 = FfxHalfOpt4(0, 0, 0, 0);
	// far-field post-filter
	ffColor = FfxDofFilterFF(baseIdx);
	ffColor2 = gix < (2 * FFX_DOF_COMBINE_TILE_SIZE + 1) ? FfxDofFilterFF(baseIdx2) : FfxHalfOpt4(0, 0, 0, 0);

	// near-field post-filter
	nfColor = FfxDofFilterNF(baseIdx);
	nfColor2 = gix < (2 * FFX_DOF_COMBINE_TILE_SIZE + 1) ? FfxDofFilterNF(baseIdx2) : FfxHalfOpt4(0, 0, 0, 0);

	// All filter reads must finish before the filtered colors overwrite the same LDS arrays.
	FFX_GROUP_MEMORY_BARRIER;

	// write out colors for interpolation
	// Note: the filtered color of window baseIdx is stored at baseIdx itself (the window's top-left entry).
	FfxDofSetIntermediateNearColor(baseIdx, nfColor);
	FfxDofSetIntermediateFarColor(baseIdx, ffColor);
	if (gix < (2 * FFX_DOF_COMBINE_TILE_SIZE + 1))
	{
		FfxDofSetIntermediateNearColor(baseIdx2, nfColor2);
		FfxDofSetIntermediateFarColor(baseIdx2, ffColor2);
	}

	// Make the filtered colors visible before neighbours are read for upscaling.
	FFX_GROUP_MEMORY_BARRIER;

	// if any FG sample has zero weight, the interpolation is invalid.
	// take the min and invalidate if zero (see CombineColors)
	// The minimum is taken over this entry and its right, lower and lower-right neighbours.
	FfxHalfOpt fgMinW = min(nfColor.a, FfxHalfOpt(ffxMin3(FfxDofGetIntermediateNearAlpha(baseIdx, 1, 0), FfxDofGetIntermediateNearAlpha(baseIdx, 0, 1), FfxDofGetIntermediateNearAlpha(baseIdx, 1, 1))));

	// upscaling
	// TR/BL are the average with the right/lower neighbour; BR is the average of all four
	// (this, right, lower, lower-right). The same scheme is used for near and far.
	FfxHalfOpt4 nfTR = FfxHalfOpt4(0, 0, 0, 0), nfBL = FfxHalfOpt4(0, 0, 0, 0), nfBR = FfxHalfOpt4(0, 0, 0, 0);
	nfTR = FfxHalfOpt(0.5) * nfColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateNearColor(baseIdx + 1);
	nfBL = FfxHalfOpt(0.5) * nfColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateNearColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH);
	nfBR = FfxHalfOpt(0.5) * nfTR + FfxHalfOpt(0.25) * FfxDofGetIntermediateNearColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH) + FfxHalfOpt(0.25) * FfxDofGetIntermediateNearColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH + 1);

	FfxHalfOpt4 ffTR = FfxHalfOpt4(0, 0, 0, 0), ffBL = FfxHalfOpt4(0, 0, 0, 0), ffBR = FfxHalfOpt4(0, 0, 0, 0);
	ffTR = FfxHalfOpt(0.5) * ffColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateFarColor(baseIdx + 1);
	ffBL = FfxHalfOpt(0.5) * ffColor + FfxHalfOpt(0.5) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH);
	ffBR = FfxHalfOpt(0.5) * ffTR + FfxHalfOpt(0.25) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH) + FfxHalfOpt(0.25) * FfxDofGetIntermediateFarColor(baseIdx + FFX_DOF_COMBINE_ROW_PITCH + 1);

	// Walk the 2x2 output quad clockwise: A = top-left, B = top-right, C = bottom-right, D = bottom-left.
	// top-left pixel
	FfxUInt32x2 coord = 2 * id;
	FfxUInt32x2 relCoord = 2 * gtID;
	FfxUInt32x2 coordA = coord;
	FfxHalfOpt4 colA = FfxDofFinalCombineColors(coord, relCoord, ffColor, nfColor, fgMinW);
	// top-right
	coord.x += 1;
	relCoord.x += 1;
	FfxUInt32x2 coordB = coord;
	FfxHalfOpt4 colB = FfxDofFinalCombineColors(coord, relCoord, ffTR, nfTR, fgMinW);
	// bottom-right
	coord.y++;
	relCoord.y++;
	FfxUInt32x2 coordC = coord;
	FfxHalfOpt4 colC = FfxDofFinalCombineColors(coord, relCoord, ffBR, nfBR, fgMinW);
	// bottom-left
	coord.x--;
	relCoord.x--;
	FfxUInt32x2 coordD = coord;
	FfxHalfOpt4 colD = FfxDofFinalCombineColors(coord, relCoord, ffBL, nfBL, fgMinW);

	// Colors and coordinates get the same swizzle so each color is still stored to its own pixel.
	FfxDofSwizQuad(colA, colB, colC, colD);
	FfxDofSwizQuad(coordA, coordB, coordC, coordD);

	FfxDofStoreOutput(coordA, colA);
	FfxDofStoreOutput(coordB, colB);
	FfxDofStoreOutput(coordC, colC);
	FfxDofStoreOutput(coordD, colD);
}

/// Entry point. Meant to run in 8x8 threads and writes 16x16 output pixels.
///
/// Classifies the tile from its dilated radii and dispatches to one of three paths:
/// a plain copy when everything is sharp, a far-field-only combine, or the full near+far combine.
///
/// @param threadID SV_DispatchThreadID.xy
/// @param groupThreadID SV_GroupThreadID.xy
/// @param group SV_GroupID.xy
/// @param index SV_GroupIndex
/// @param halfImageSize Pixel size of the input (half resolution)
/// @param fullImageSize Pixel size of the output (full resolution)
/// @ingroup FfxGPUDof
void FfxDofCombineHalfRes(FfxUInt32x2 threadID, FfxUInt32x2 groupThreadID, FfxUInt32x2 group, FfxUInt32 index, FfxUInt32x2 halfImageSize, FfxUInt32x2 fullImageSize)
{
	// classify tile
	const FfxFloat32x2 tileRadius = FfxDofGetTileRadius(group);
	// halved due to resolution change, then: 2px = threshold in main pass + small inaccuracy bias
	const FfxBoolean needsNearField = tileRadius.x > -1.025;
	const FfxBoolean everythingSharp = max(abs(tileRadius.x), abs(tileRadius.y)) < 0.25;

	if (everythingSharp)
	{
		FfxDofCombineSharpOnly(group, groupThreadID);
	}
	else
	{
		// Both blurred paths need the full-resolution color tile in LDS first.
		FfxDofFetchFullColor(group, index, fullImageSize);
		if (needsNearField)
		{
			FfxDofCombineAll(threadID, groupThreadID, group, index, halfImageSize);
		}
		else
		{
			FfxDofCombineFarOnly(threadID, groupThreadID, group, index, halfImageSize);
		}
	}
}