From 6d05ab865a02aa9b6c405c86eb19bfc74f047965 Mon Sep 17 00:00:00 2001 From: ecker Date: Tue, 21 Apr 2026 20:49:47 -0500 Subject: [PATCH] neurotic optimizations (overhauled depth pyramid and bloom to ffx-sdp), more fixes --- bin/data/config.json | 5 +- .../scenes/sourceengine/sourceengine.json | 4 +- bin/data/shaders/common/functions.h | 3 +- bin/data/shaders/display/bloom/comp.glsl | 65 - bin/data/shaders/display/bloom/down.comp.glsl | 131 ++ bin/data/shaders/display/bloom/up.comp.glsl | 60 + bin/data/shaders/display/deferred/comp/comp.h | 4 +- .../shaders/display/depth-pyramid/comp.glsl | 103 +- bin/data/shaders/ext/ffx_a.h | 1907 +++++++++++++++++ bin/data/shaders/ext/ffx_spd.h | 1297 +++++++++++ bin/data/shaders/graph/cull/comp.glsl | 190 +- bin/data/shaders/raytrace/shader.ray-gen.glsl | 2 +- engine/inc/uf/engine/graph/graph.h | 2 + engine/inc/uf/ext/vulkan/device.h | 3 + engine/inc/uf/ext/vulkan/graphic.h | 4 +- engine/inc/uf/ext/vulkan/texture.h | 2 +- engine/inc/uf/ext/vulkan/vk.h | 2 +- engine/inc/uf/ext/vulkan/vulkan.h | 4 - engine/src/engine/ext/scene/behavior.cpp | 12 +- engine/src/engine/graph/graph.cpp | 6 +- engine/src/ext/vulkan/graphic.cpp | 33 +- engine/src/ext/vulkan/rendermodes/base.cpp | 15 +- .../src/ext/vulkan/rendermodes/deferred.cpp | 444 ++-- .../src/ext/vulkan/rendermodes/transition.inl | 58 +- engine/src/ext/vulkan/shader.cpp | 37 +- engine/src/ext/vulkan/texture.cpp | 10 +- engine/src/ext/vulkan/vulkan.cpp | 17 +- 27 files changed, 3986 insertions(+), 434 deletions(-) delete mode 100644 bin/data/shaders/display/bloom/comp.glsl create mode 100644 bin/data/shaders/display/bloom/down.comp.glsl create mode 100644 bin/data/shaders/display/bloom/up.comp.glsl create mode 100644 bin/data/shaders/ext/ffx_a.h create mode 100644 bin/data/shaders/ext/ffx_spd.h diff --git a/bin/data/config.json b/bin/data/config.json index 950847d9..da51c7a9 100644 --- a/bin/data/config.json +++ b/bin/data/config.json @@ -110,7 +110,7 @@ "default stage buffers": 
true, "default defer buffer destroy": true, "default command buffer immediate": true, - "multithreaded recording": true + "multithreaded recording": false }, "pipelines": { "deferred": true, @@ -147,7 +147,8 @@ "deviceCoherentMemory", "robustBufferAccess", "samplerAnisotropy", - "sampleRateShading" + "sampleRateShading", + "samplerFilterMinmax" ], "featureChain": [] }, diff --git a/bin/data/scenes/sourceengine/sourceengine.json b/bin/data/scenes/sourceengine/sourceengine.json index feac6dec..95a20fe6 100644 --- a/bin/data/scenes/sourceengine/sourceengine.json +++ b/bin/data/scenes/sourceengine/sourceengine.json @@ -1,9 +1,9 @@ { // "import": "./rp_downtown_v2.json" - "import": "./ss2_medsci1.json" +// "import": "./ss2_medsci1.json" // "import": "./test_grid.json" // "import": "./sh2_mcdonalds.json" // "import": "./animal_crossing.json" -// "import": "./mds_mcdonalds.json" + "import": "./mds_mcdonalds.json" // "import": "./gm_construct.json" } \ No newline at end of file diff --git a/bin/data/shaders/common/functions.h b/bin/data/shaders/common/functions.h index 95ce4718..e626031a 100644 --- a/bin/data/shaders/common/functions.h +++ b/bin/data/shaders/common/functions.h @@ -44,6 +44,7 @@ void gammaCorrect( inout vec3 color, float gamma ) { } void toneMap( inout vec4 color, float exposure ) { toneMap(color.rgb, exposure); } void gammaCorrect( inout vec4 color, float gamma ) { gammaCorrect(color.rgb, gamma); } +float luma( vec3 color ) { return dot(color, vec3(0.2126, 0.7152, 0.0722)); } // uint tea(uint val0, uint val1) { uint v0 = val0; @@ -152,6 +153,7 @@ vec3 decodeSrgb(vec3 rgb) { const vec3 c = step(vec3(0.04045), rgb); return mix(a, b, c); } +#if !SPD && (DEFERRED || FRAGMENT || COMPUTE || RT) bool validTextureIndex( int textureIndex ) { return 0 <= textureIndex && textureIndex < MAX_TEXTURES; } @@ -160,7 +162,6 @@ bool validCubemapIndex( int textureIndex ) { return 0 <= textureIndex && textureIndex < MAX_CUBEMAPS; } #endif -#if !BLOOM && (DEFERRED || FRAGMENT 
|| COMPUTE || RT) bool validTextureIndex( uint id ) { return 0 <= id && id < MAX_TEXTURES; } diff --git a/bin/data/shaders/display/bloom/comp.glsl b/bin/data/shaders/display/bloom/comp.glsl deleted file mode 100644 index 058764f3..00000000 --- a/bin/data/shaders/display/bloom/comp.glsl +++ /dev/null @@ -1,65 +0,0 @@ -#version 450 -#pragma shader_stage(compute) - -#define COMPUTE 1 -#define TEXTURES 0 -#define CUBEMAPS 0 -#define BLOOM 1 - -layout (local_size_x = 16, local_size_y = 16, local_size_z = 1) in; - -layout( push_constant ) uniform PushBlock { - uint eye; - uint mode; -} PushConstant; - -layout (binding = 0) uniform UBO { - float threshold; - float smoothness; - uint size; - float padding1; - - float weights[32]; -} ubo; - -layout (binding = 1, rgba16f) uniform image2D imageColor; -layout (binding = 2, rgba16f) uniform image2D imageBloom; -layout (binding = 3, rgba16f) uniform image2D imagePingPong; - -#include "../../common/macros.h" -#include "../../common/structs.h" -#include "../../common/functions.h" - -void main() { - const uint mode = PushConstant.mode; - const ivec2 texel = ivec2(gl_GlobalInvocationID.xy); - const ivec2 size = imageSize( imageColor ); - if ( texel.x >= size.x || texel.y >= size.y ) return; - - if ( mode == 0 ) { // fill bloom - vec3 result = imageLoad( imageColor, texel ).rgb; - float brightness = dot(result, vec3(0.2126, 0.7152, 0.0722)); - if( brightness < ubo.threshold ) result = vec3(0.0); - imageStore(imageBloom, texel, vec4(result, 1.0)); - } else if ( mode == 1 ) { // bloom horizontal - vec3 result = imageLoad( imageBloom, texel ).rgb * ubo.weights[0]; - for ( int i = 1; i < int(ubo.size); ++i ) { - vec3 c1 = imageLoad( imageBloom, texel + ivec2(i, 0) ).rgb; - vec3 c2 = imageLoad( imageBloom, texel - ivec2(i, 0) ).rgb; - result += (c1 + c2) * ubo.weights[i]; - } - imageStore( imagePingPong, texel, vec4(result, 1.0) ); - } else if ( mode == 2 ) { // bloom vertical - vec3 result = imageLoad( imagePingPong, texel ).rgb * 
ubo.weights[0]; - for( int i = 1; i < int(ubo.size); ++i ) { - vec3 c1 = imageLoad( imagePingPong, texel + ivec2(0, i) ).rgb; - vec3 c2 = imageLoad( imagePingPong, texel - ivec2(0, i) ).rgb; - result += (c1 + c2) * ubo.weights[i]; - } - imageStore(imageBloom, texel, vec4(result, 1.0)); - } else if ( mode == 3 ) { // combine - vec3 base = imageLoad( imageColor, texel ).rgb; - vec3 bloom = imageLoad( imageBloom, texel ).rgb; - imageStore( imageColor, texel, vec4(base + bloom, 1.0) ); - } -} \ No newline at end of file diff --git a/bin/data/shaders/display/bloom/down.comp.glsl b/bin/data/shaders/display/bloom/down.comp.glsl new file mode 100644 index 00000000..02f350c3 --- /dev/null +++ b/bin/data/shaders/display/bloom/down.comp.glsl @@ -0,0 +1,131 @@ +#version 450 +#pragma shader_stage(compute) + +#extension GL_KHR_shader_subgroup_quad : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_samplerless_texture_functions : enable + +#define COMPUTE 1 +#define SPD 1 + +#include "../../common/macros.h" +#include "../../common/structs.h" +#include "../../common/functions.h" + +layout (local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout (constant_id = 0) const uint MIPS = 6; + +layout(push_constant) uniform PushBlock { + uint mips; + uint numWorkGroups; + uint workGroupOffset; +} PushConstant_; + + +layout (binding = 0, rgba16f) uniform image2D imageColor; +layout (binding = 1, rgba16f) uniform image2D imageBright; // yucky +layout (binding = 2, rgba16f) coherent uniform image2D outImage[MIPS]; + +layout (binding = 3, std430) buffer AtomicCounter { + uint counter; +} spdCounter; + +layout (binding = 4) uniform UBO { + float threshold; + float smoothness; + uint size; + float padding1; + + float weights[32]; +} ubo; + +#define A_GLSL 1 +#define A_GPU 1 +#define SPD_NO_WAVE_OPERATIONS 0 +#include "../../ext/ffx_a.h" + +shared AU1 spd_counter; +shared AF4 spd_intermediate[16][16]; + +vec3 applySoftKnee(vec3 color, float 
luminance) { + float rq = clamp(luminance - ubo.threshold + ubo.smoothness, 0.0, 2.0 * ubo.smoothness); + rq = (rq * rq) / (4.0 * ubo.smoothness + 0.0001); + + float value = max(rq, luminance - ubo.threshold); + + return color * (value / (max(luminance, 0.0001))); +} + +AF4 SpdLoadSourceImage(ASU2 p, AU1 slice) { + ivec2 size = imageSize(imageColor); + + // sample color if in bound, else black + vec3 c0 = p.x < size.x && p.y < size.y ? imageLoad(imageColor, p + ivec2(0, 0)).rgb : vec3(0.0); + vec3 c1 = p.x + 1 < size.x && p.y < size.y ? imageLoad(imageColor, p + ivec2(1, 0)).rgb : vec3(0.0); + vec3 c2 = p.x < size.x && p.y + 1 < size.y ? imageLoad(imageColor, p + ivec2(0, 1)).rgb : vec3(0.0); + vec3 c3 = p.x + 1 < size.x && p.y + 1 < size.y ? imageLoad(imageColor, p + ivec2(1, 1)).rgb : vec3(0.0); + + // get luma + float b0 = luma(c0); + float b1 = luma(c1); + float b2 = luma(c2); + float b3 = luma(c3); + + // soften + c0 = applySoftKnee(c0, b0); + c1 = applySoftKnee(c1, b1); + c2 = applySoftKnee(c2, b2); + c3 = applySoftKnee(c3, b3); + + // karis luma weighted average + float w0 = 1.0 / (b0 + 1.0); + float w1 = 1.0 / (b1 + 1.0); + float w2 = 1.0 / (b2 + 1.0); + float w3 = 1.0 / (b3 + 1.0); + float inv_wsum = 1.0 / (w0 + w1 + w2 + w3); + + // store to mip 0 + if (p.x < size.x && p.y < size.y) imageStore(outImage[0], p + ivec2(0, 0), vec4(c0, 1.0)); + if (p.x + 1 < size.x && p.y < size.y) imageStore(outImage[0], p + ivec2(1, 0), vec4(c1, 1.0)); + if (p.x < size.x && p.y + 1 < size.y) imageStore(outImage[0], p + ivec2(0, 1), vec4(c2, 1.0)); + if (p.x + 1 < size.x && p.y + 1 < size.y) imageStore(outImage[0], p + ivec2(1, 1), vec4(c3, 1.0)); + + // average + return AF4((c0 * w0 + c1 * w1 + c2 * w2 + c3 * w3) * inv_wsum, 1.0); +} + +AF4 SpdLoad(ASU2 p, AU1 slice) { + uint loadMip = min(6u - 1, MIPS - 1); + return imageLoad(outImage[loadMip + 1], p); +} + +void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice) { + if ( mip + 1 < MIPS ) { + imageStore(outImage[mip + 1], p, 
value); + } +} + +// average filter +AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3) { + return (v0 + v1 + v2 + v3) * 0.25; +} + +AF4 SpdLoadIntermediate(AU1 x, AU1 y) { return spd_intermediate[x][y]; } +void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value) { spd_intermediate[x][y] = value; } + +void SpdIncreaseAtomicCounter(AU1 slice) { spd_counter = atomicAdd(spdCounter.counter, 1); } +AU1 SpdGetAtomicCounter() { return spd_counter; } +void SpdResetAtomicCounter(AU1 slice) { spdCounter.counter = 0; } + +#include "../../ext/ffx_spd.h" + +void main() { + SpdDownsample( + AU2(gl_WorkGroupID.xy), + AU1(gl_LocalInvocationIndex), + AU1(PushConstant_.mips - 1), + AU1(PushConstant_.numWorkGroups), + AU1(PushConstant_.workGroupOffset) + ); +} \ No newline at end of file diff --git a/bin/data/shaders/display/bloom/up.comp.glsl b/bin/data/shaders/display/bloom/up.comp.glsl new file mode 100644 index 00000000..057fc76f --- /dev/null +++ b/bin/data/shaders/display/bloom/up.comp.glsl @@ -0,0 +1,60 @@ +#version 450 +#pragma shader_stage(compute) + +#define COMPUTE 1 + +layout (local_size_x = 16, local_size_y = 16, local_size_z = 1) in; + +layout (constant_id = 0) const uint MIPS = 6; + +layout (binding = 0, rgba16f) uniform image2D imageColor; +layout (binding = 1) uniform sampler2D samplerBloom; + +layout (binding = 2) uniform UBO { + float threshold; + float smoothness; + uint size; + float padding1; + + float weights[32]; +} ubo; + +// 9-tap bilinear tent filter +vec3 tentFilter(sampler2D tex, vec2 uv, float lod) { + vec2 texSize = vec2(textureSize(tex, int(lod))); + vec4 d = (1.0 / texSize.xyxy) * vec4(1.0, 1.0, -1.0, 0.0); + + vec3 s = textureLod(tex, uv - d.xy, lod).rgb; + s += textureLod(tex, uv - d.wy, lod).rgb * 2.0; + s += textureLod(tex, uv - d.zy, lod).rgb; + s += textureLod(tex, uv + d.zw, lod).rgb * 2.0; + s += textureLod(tex, uv, lod).rgb * 4.0; + s += textureLod(tex, uv + d.xw, lod).rgb * 2.0; + s += textureLod(tex, uv + d.zy, lod).rgb; + s += textureLod(tex, uv + 
d.wy, lod).rgb * 2.0; + s += textureLod(tex, uv + d.xy, lod).rgb; + + return s * (1.0 / 16.0); +} + +void main() { + ivec2 texel = ivec2(gl_GlobalInvocationID.xy); + ivec2 size = imageSize(imageColor); + if ( texel.x >= size.x || texel.y >= size.y ) return; + + vec2 uv = (vec2(texel) + 0.5) / vec2(size); + vec3 bloomAcc = vec3(0.0); + float weightSum = 0.0; + + for ( uint i = 0; i < min(MIPS, ubo.size); ++i ) { + float w = ubo.weights[i]; + bloomAcc += textureLod(samplerBloom, uv, float(i)).rgb * w; + //bloomAcc += tentFilter(samplerBloom, uv, float(i)) * w; + weightSum += w; + } + + if ( weightSum > 0.0 ) bloomAcc /= weightSum; + + vec3 base = imageLoad( imageColor, texel ).rgb; + imageStore( imageColor, texel, vec4(base + bloomAcc, 1.0) ); +} \ No newline at end of file diff --git a/bin/data/shaders/display/deferred/comp/comp.h b/bin/data/shaders/display/deferred/comp/comp.h index 9e37b8db..22e32141 100644 --- a/bin/data/shaders/display/deferred/comp/comp.h +++ b/bin/data/shaders/display/deferred/comp/comp.h @@ -159,7 +159,7 @@ void postProcess() { #if FOG fog( surface.ray, surface.fragment.rgb, surface.fragment.a ); #endif - float brightness = dot(surface.fragment.rgb, vec3(0.2126, 0.7152, 0.0722)); + float brightness = luma(surface.fragment.rgb); bool bloom = brightness > ubo.settings.bloom.threshold; //if ( bloom ) toneMap( surface.fragment.rgb, brightness ); vec4 outFragColor = vec4(surface.fragment.rgb, 1.0); @@ -184,7 +184,7 @@ void postProcess() { } IMAGE_STORE( imageColor, outFragColor ); - IMAGE_STORE( imageBright, outFragBright ); + //IMAGE_STORE( imageBright, outFragBright ); IMAGE_STORE( imageMotion, vec4(outFragMotion, 0, 0) ); } diff --git a/bin/data/shaders/display/depth-pyramid/comp.glsl b/bin/data/shaders/display/depth-pyramid/comp.glsl index efbaa5e5..2285e7df 100644 --- a/bin/data/shaders/display/depth-pyramid/comp.glsl +++ b/bin/data/shaders/display/depth-pyramid/comp.glsl @@ -1,35 +1,96 @@ #version 450 #pragma shader_stage(compute) 
-//#extension GL_EXT_nonuniform_qualifier : enable - -layout (constant_id = 0) const uint MIPS = 6; -layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in; +#extension GL_KHR_shader_subgroup_quad : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_samplerless_texture_functions : enable #define COMPUTE 1 +#define SPD 1 #include "../../common/macros.h" #include "../../common/structs.h" +#include "../../common/functions.h" -layout( push_constant ) uniform PushBlock { - uint _; - uint pass; -} PushConstant; +layout (local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout (constant_id = 0) const uint MIPS = 6; + +layout(push_constant) uniform PushBlock { + uint mips; + uint numWorkGroups; + uint workGroupOffset; +} PushConstant_; layout (binding = 0) uniform sampler2D samplerDepth; -layout (binding = 1) uniform sampler2D inImage[MIPS]; -layout (binding = 2, r32f) uniform writeonly image2D outImage[MIPS]; +layout (binding = 1, r32f) coherent uniform image2D outImage[MIPS]; + +layout (binding = 2, std430) buffer AtomicCounter { + uint counter; +} spdCounter; + + +#define A_GLSL 1 +#define A_GPU 1 +#define SPD_NO_WAVE_OPERATIONS 0 +#include "../../ext/ffx_a.h" + +shared AU1 spd_counter; +shared AF1 spd_intermediate[16][16]; + +AF4 SpdLoadSourceImage(ASU2 p, AU1 slice) { + ivec2 size = imageSize(outImage[0]); + + // sample depth if in bound, else 0 (0 for reverse-z projection, use 1 if normal projection) + float d0 = p.x < size.x && p.y < size.y ? texelFetch(samplerDepth, p + ivec2(0, 0), 0).x : 0.0; + float d1 = p.x + 1 < size.x && p.y < size.y ? texelFetch(samplerDepth, p + ivec2(1, 0), 0).x : 0.0; + float d2 = p.x < size.x && p.y + 1 < size.y ? texelFetch(samplerDepth, p + ivec2(0, 1), 0).x : 0.0; + float d3 = p.x + 1 < size.x && p.y + 1 < size.y ? 
texelFetch(samplerDepth, p + ivec2(1, 1), 0).x : 0.0; + + // store to mip 0 + if (p.x < size.x && p.y < size.y) imageStore(outImage[0], p + ivec2(0, 0), vec4(d0)); + if (p.x + 1 < size.x && p.y < size.y) imageStore(outImage[0], p + ivec2(1, 0), vec4(d1)); + if (p.x < size.x && p.y + 1 < size.y) imageStore(outImage[0], p + ivec2(0, 1), vec4(d2)); + if (p.x + 1 < size.x && p.y + 1 < size.y) imageStore(outImage[0], p + ivec2(1, 1), vec4(d3)); + + return AF4(d0, d1, d2, d3); +} + +AF4 SpdLoad(ASU2 p, AU1 slice) { + uint loadMip = min(6u, MIPS - 1); + float d = imageLoad(outImage[loadMip], p).r; + return AF4(d, d, d, d); +} + +void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice) { + if ( mip + 1 < MIPS ) { + imageStore(outImage[mip + 1], p, vec4(value.x)); + } +} + +AF4 SpdLoadIntermediate(AU1 x, AU1 y) { + float d = spd_intermediate[x][y]; + return AF4(d, d, d, d); +} +void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value) { spd_intermediate[x][y] = value.x; } +void SpdIncreaseAtomicCounter(AU1 slice) { spd_counter = atomicAdd(spdCounter.counter, 1); } +AU1 SpdGetAtomicCounter() { return spd_counter; } +void SpdResetAtomicCounter(AU1 slice) { spdCounter.counter = 0; } + +// min filter +AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3) { + float minVal = min(min(v0.x, v1.x), min(v2.x, v3.x)); + return AF4(minVal, minVal, minVal, minVal); +} + +#include "../../ext/ffx_spd.h" void main() { - int mip = int(PushConstant.pass); - - float depth; - ivec2 pos = ivec2(gl_GlobalInvocationID.xy); - if ( mip == 0 ) { - depth = texelFetch(samplerDepth, pos, 0).r; - } else { - depth = texture(inImage[mip - 1], (vec2(gl_GlobalInvocationID.xy) + vec2(0.5)) / imageSize( outImage[mip] )).x; - } - - imageStore(outImage[mip], pos, vec4(depth)); + SpdDownsample( + AU2(gl_WorkGroupID.xy), + AU1(gl_LocalInvocationIndex), + AU1(PushConstant_.mips - 1), + AU1(PushConstant_.numWorkGroups), + AU1(PushConstant_.workGroupOffset) + ); } \ No newline at end of file diff --git a/bin/data/shaders/ext/ffx_a.h 
b/bin/data/shaders/ext/ffx_a.h new file mode 100644 index 00000000..0a7cc01d --- /dev/null +++ b/bin/data/shaders/ext/ffx_a.h @@ -0,0 +1,1907 @@ +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// [A] SHADER PORTABILITY 1.20190530 +// +//============================================================================================================================== +// LICENSE +// ======= +// Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) <2014> +// ------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// Common central point for high-level shading language and C portability for various shader headers. +//------------------------------------------------------------------------------------------------------------------------------ +// DEFINES +// ======= +// A_CPU ..... Include the CPU related code. +// A_GPU ..... Include the GPU related code. +// A_GLSL .... Using GLSL. +// A_HLSL .... Using HLSL. +// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default). +// ======= +// A_BYTE .... Support 8-bit integer. +// A_HALF .... Support 16-bit integer and floating point. +// A_LONG .... Support 64-bit integer. +// A_DUBL .... Support 64-bit floating point. +// ======= +// A_WAVE .... Support wave-wide operations. +//------------------------------------------------------------------------------------------------------------------------------ +// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'. +//------------------------------------------------------------------------------------------------------------------------------ +// SIMPLIFIED TYPE SYSTEM +// ====================== +// - All ints will be unsigned with exception of when signed is required. +// - Type naming simplified and shortened "A<#components>", +// - H = 16-bit float (half) +// - F = 32-bit float (float) +// - D = 64-bit float (double) +// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte) +// - B = 8-bit integer (byte) +// - W = 16-bit integer (word) +// - U = 32-bit integer (unsigned) +// - L = 64-bit integer (long) +// - Using "AS<#components>" for signed when required. 
+//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops). +// - Add subgroup ops. +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGE LOG +// ========== +// 20190531 - Fixed changed to llabs() because long is int on Windows. +// 20190530 - Updated for new CPU/GPU portability. +// 20190528 - Fix AU1_AH2_x() on HLSL (had incorrectly swapped x and y), fixed asuint() cases. +// 20190527 - Added min3/max3 for low precision for HLSL. +// 20190526 - Updated with half approximations, added ARsq*(), and ASat*() for CPU. +// 20190519 - Added more approximations. +// 20190514 - Added long conversions. +// 20190513 - Added the real BFI moved the other one to ABfiM(). +// 20190507 - Added extra remap useful for 2D reductions. +// 20190507 - Started adding wave ops, add parabolic sin/cos. +// 20190505 - Added ASigned*() and friends, setup more auto-typecast, GLSL extensions, etc. +// 20190504 - Added min3/max3 for 32-bit integers. +// 20190503 - Added type reinterpretation for half. +// 20190416 - Added min3/max3 for half. +// 20190405 - Misc bug fixing. +// 20190404 - Cleaned up color conversion code. Switched "splat" to shorter naming "type_". Misc bug fixing. 
+//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COMMON +//============================================================================================================================== +#define A_2PI 6.28318530718 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// CPU +// +// 
+//============================================================================================================================== +// Requires standard C types: stdint.h +// Requires a collection of standard math intrinsics. +// - Requires VS2013 when not using GCC to get exp2() and log2(). +// - https://blogs.msdn.microsoft.com/vcblog/2013/07/19/c99-library-support-in-visual-studio-2013/ +//------------------------------------------------------------------------------------------------------------------------------ +// This provides a minimum subset of functionality compared to the GPU parts. +//============================================================================================================================== +#ifdef A_CPU + // Supporting user defined overrides. + #ifndef A_RESTRICT + #define A_RESTRICT __restrict + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifndef A_STATIC + #define A_STATIC static + #endif +//------------------------------------------------------------------------------------------------------------------------------ + // Same types across CPU and GPU. + // Predicate uses 32-bit integer (C friendly bool). 
+ typedef uint32_t AP1; + typedef float AF1; + typedef double AD1; + typedef uint8_t AB1; + typedef uint16_t AW1; + typedef uint32_t AU1; + typedef uint64_t AL1; + typedef int8_t ASB1; + typedef int16_t ASW1; + typedef int32_t ASU1; + typedef int64_t ASL1; +//------------------------------------------------------------------------------------------------------------------------------ + #define AD1_(a) ((AD1)(a)) + #define AF1_(a) ((AF1)(a)) + #define AL1_(a) ((AL1)(a)) + #define AU1_(a) ((AU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1_(a) ((ASL1)(a)) + #define ASU1_(a) ((ASU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;} +//------------------------------------------------------------------------------------------------------------------------------ + #define A_TRUE 1 + #define A_FALSE 0 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// CPU/GPU PORTING +// 
+//------------------------------------------------------------------------------------------------------------------------------ +// Hackary to get CPU and GPU to share all setup code, without duplicate code paths. +// Unfortunately this is the level of "ugly" that is required since the languages are very different. +// This uses a lower-case prefix for special vector constructs. +// - In C restrict pointers are used. +// - In the shading language, in/inout/out arguments are used. +// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]). +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD1 *A_RESTRICT + #define retAD3 AD1 *A_RESTRICT + #define retAD4 AD1 *A_RESTRICT + #define retAF2 AF1 *A_RESTRICT + #define retAF3 AF1 *A_RESTRICT + #define retAF4 AF1 *A_RESTRICT + #define retAL2 AL1 *A_RESTRICT + #define retAL3 AL1 *A_RESTRICT + #define retAL4 AL1 *A_RESTRICT + #define retAU2 AU1 *A_RESTRICT + #define retAU3 AU1 *A_RESTRICT + #define retAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 AD1 *A_RESTRICT + #define 
inAD3 AD1 *A_RESTRICT + #define inAD4 AD1 *A_RESTRICT + #define inAF2 AF1 *A_RESTRICT + #define inAF3 AF1 *A_RESTRICT + #define inAF4 AF1 *A_RESTRICT + #define inAL2 AL1 *A_RESTRICT + #define inAL3 AL1 *A_RESTRICT + #define inAL4 AL1 *A_RESTRICT + #define inAU2 AU1 *A_RESTRICT + #define inAU3 AU1 *A_RESTRICT + #define inAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 AD1 *A_RESTRICT + #define inoutAD3 AD1 *A_RESTRICT + #define inoutAD4 AD1 *A_RESTRICT + #define inoutAF2 AF1 *A_RESTRICT + #define inoutAF3 AF1 *A_RESTRICT + #define inoutAF4 AF1 *A_RESTRICT + #define inoutAL2 AL1 *A_RESTRICT + #define inoutAL3 AL1 *A_RESTRICT + #define inoutAL4 AL1 *A_RESTRICT + #define inoutAU2 AU1 *A_RESTRICT + #define inoutAU3 AU1 *A_RESTRICT + #define inoutAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 AD1 *A_RESTRICT + #define outAD3 AD1 *A_RESTRICT + #define outAD4 AD1 *A_RESTRICT + #define outAF2 AF1 *A_RESTRICT + #define outAF3 AF1 *A_RESTRICT + #define outAF4 AF1 *A_RESTRICT + #define outAL2 AL1 *A_RESTRICT + #define outAL3 AL1 *A_RESTRICT + #define outAL4 AL1 *A_RESTRICT + #define outAU2 AU1 *A_RESTRICT + #define outAU3 AU1 *A_RESTRICT + #define outAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD1 x[2] + #define varAD3(x) AD1 x[3] + #define varAD4(x) AD1 x[4] + #define varAF2(x) AF1 x[2] + #define varAF3(x) AF1 x[3] + #define varAF4(x) AF1 x[4] + #define varAL2(x) AL1 x[2] + #define varAL3(x) AL1 x[3] + #define varAL4(x) AL1 x[4] + #define varAU2(x) AU1 x[2] + #define varAU3(x) AU1 x[3] + #define varAU4(x) AU1 x[4] 
+//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) {x,y} + #define initAD3(x,y,z) {x,y,z} + #define initAD4(x,y,z,w) {x,y,z,w} + #define initAF2(x,y) {x,y} + #define initAF3(x,y,z) {x,y,z} + #define initAF4(x,y,z,w) {x,y,z,w} + #define initAL2(x,y) {x,y} + #define initAL3(x,y,z) {x,y,z} + #define initAL4(x,y,z,w) {x,y,z,w} + #define initAU2(x,y) {x,y} + #define initAU3(x,y,z) {x,y,z} + #define initAU4(x,y,z,w) {x,y,z,w} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Replace transcendentals with manual versions. 
+//============================================================================================================================== + #ifdef A_GCC + A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_labs(ASL1_(a)));} + #else + A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(llabs(ASL1_(a)));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} + #else + A_STATIC AD1 ACosD1(AD1 a){return cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} + A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} + #else + A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);} + #endif 
+//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} + #else + A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} + A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);} + #else + A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;} + A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;} + A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;} + A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + // These follow the convention that A integer types don't have signage, until they are operated on. 
+ A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;} + A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a>ASL1_(b));} + A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);} + #else + A_STATIC AD1 ASinD1(AD1 a){return sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return sinf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);} + #else + A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);} + A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);} 
+//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));} + A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));} + A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} + A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. 
+//============================================================================================================================== + A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;} + A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;} + A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;} + A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;} + A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opACpyD2(outAD2 
d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;} + A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;} + A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;} + A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;} + A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpOneD2(outAD2 
d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;} + A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;} + A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;} + A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;} + A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 
b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 
a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//============================================================================================================================== + A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;} + A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;} + A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;} + A_STATIC retAD3 opARcpD3(outAD3 
d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;} + A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;} + A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;} + A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF FLOAT PACKING +//============================================================================================================================== + // Convert float to half (in lower 16-bits of output). + // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + // Supports denormals. 
+ // Conversion rules are to make computations possibly "safer" on the GPU, + // -INF & -NaN -> -65504 + // +INF & +NaN -> +65504 + A_STATIC AU1 AU1_AH1_AF1(AF1 f){ + static AW1 base[512]={ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100, + 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00, + 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 
0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100, + 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00, + 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff}; + static AB1 shift[512]={ + 
0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 
0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18}; + union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} +//------------------------------------------------------------------------------------------------------------------------------ + // Used to output packed constant. + A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ 
+//============================================================================================================================== +// +// +// GLSL +// +// +//============================================================================================================================== +#if defined(A_GLSL) && defined(A_GPU) + #ifndef A_SKIP_EXT + #ifdef A_HALF + #extension GL_EXT_shader_16bit_storage:require + #extension GL_EXT_shader_explicit_arithmetic_types:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_LONG + #extension GL_ARB_gpu_shader_int64:require + // TODO: Fixme to more portable extension!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + #extension GL_NV_shader_atomic_int64:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_WAVE + #extension GL_KHR_shader_subgroup_arithmetic:require + #extension GL_KHR_shader_subgroup_ballot:require + #extension GL_KHR_shader_subgroup_quad:require + #extension GL_KHR_shader_subgroup_shuffle:require + #endif + #endif +//============================================================================================================================== + #define AP1 bool + #define AP2 bvec2 + #define AP3 bvec3 + #define AP4 bvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 vec2 + #define AF3 vec3 + #define AF4 vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uvec2 + #define AU3 uvec3 + #define AU4 uvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int 
+ #define ASU2 ivec2 + #define ASU3 ivec3 + #define ASU4 ivec4 +//============================================================================================================================== + #define AF1_AU1(x) uintBitsToFloat(AU1(x)) + #define AF2_AU2(x) uintBitsToFloat(AU2(x)) + #define AF3_AU3(x) uintBitsToFloat(AU3(x)) + #define AF4_AU4(x) uintBitsToFloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) floatBitsToUint(AF1(x)) + #define AU2_AF2(x) floatBitsToUint(AF2(x)) + #define AU3_AF3(x) floatBitsToUint(AF3(x)) + #define AU4_AF4(x) floatBitsToUint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2_AF2 packHalf2x16 + #define AU1_AW2Unorm_AF2 packUnorm2x16 + #define AU1_AB4Unorm_AF4 packUnorm4x8 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF2_AH2_AU1 unpackHalf2x16 + #define AF2_AW2Unorm_AU1 unpackUnorm2x16 + #define AF4_AB4Unorm_AU1 unpackUnorm4x8 +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) 
AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #define AB1 uint8_t + #define AB2 u8vec2 + #define AB3 u8vec3 + #define AB4 u8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASB1 int8_t + #define ASB2 i8vec2 + #define ASB3 i8vec3 + #define ASB4 i8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + AB1 
AB1_x(AB1 a){return AB1(a);} + AB2 AB2_x(AB1 a){return AB2(a,a);} + AB3 AB3_x(AB1 a){return AB3(a,a,a);} + AB4 AB4_x(AB1 a){return AB4(a,a,a,a);} + #define AB1_(a) AB1_x(AB1(a)) + #define AB2_(a) AB2_x(AB1(a)) + #define AB3_(a) AB3_x(AB1(a)) + #define AB4_(a) AB4_x(AB1(a)) + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #define AH1 float16_t + #define AH2 f16vec2 + #define AH3 f16vec3 + #define AH4 f16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 u16vec2 + #define AW3 u16vec3 + #define AW4 u16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 i16vec2 + #define ASW3 i16vec3 + #define ASW4 i16vec4 +//============================================================================================================================== + #define AH2_AU1(x) unpackFloat2x16(AU1(x)) + AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));} + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) unpackUint2x16(AU1(x)) + #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x))) 
+//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2(x) packFloat2x16(AH2(x)) + AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));} + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) packUint2x16(AW2(x)) + #define AU2_AW4(x) unpack32(packUint4x16(AW4(x))) +//============================================================================================================================== + #define AW1_AH1(x) halfBitsToUint16(AH1(x)) + #define AW2_AH2(x) halfBitsToUint16(AH2(x)) + #define AW3_AH3(x) halfBitsToUint16(AH3(x)) + #define AW4_AH4(x) halfBitsToUint16(AH4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AH1_AW1(x) uint16BitsToHalf(AW1(x)) + #define AH2_AW2(x) uint16BitsToHalf(AW2(x)) + #define AH3_AW3(x) uint16BitsToHalf(AW3(x)) + #define AH4_AW4(x) uint16BitsToHalf(AW4(x)) +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return 
AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFractH1(AH1 x){return fract(x);} + AH2 AFractH2(AH2 x){return fract(x);} + AH3 AFractH3(AH3 x){return fract(x);} + AH4 AFractH4(AH4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of max3. + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of min3. 
+ AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;} + AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;} + AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;} + AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);} + AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);} + AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);} + AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));} + AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));} + AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));} + AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #define AD1 double + #define AD2 dvec2 + #define AD3 dvec3 + #define AD4 dvec4 +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 x){return fract(x);} + AD2 AFractD2(AD2 x){return fract(x);} + AD3 AFractD3(AD3 x){return fract(x);} + AD4 AFractD4(AD4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;} + AD2 ARcpD2(AD2 
x){return AD2_(1.0)/x;} + AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;} + AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);} + AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);} + AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);} + AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));} + AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));} + AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));} + AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL LONG +//============================================================================================================================== + #ifdef A_LONG + #define AL1 uint64_t + #define AL2 u64vec2 + #define AL3 u64vec3 + #define AL4 u64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1 int64_t + #define ASL2 i64vec2 + #define ASL3 i64vec3 + #define ASL4 i64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AL1_AU2(x) packUint2x32(AU2(x)) + #define AU2_AL1(x) 
unpackUint2x32(AL1(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AL1_x(AL1 a){return AL1(a);} + AL2 AL2_x(AL1 a){return AL2(a,a);} + AL3 AL3_x(AL1 a){return AL3(a,a,a);} + AL4 AL4_x(AL1 a){return AL4(a,a,a,a);} + #define AL1_(a) AL1_x(AL1(a)) + #define AL2_(a) AL2_x(AL1(a)) + #define AL3_(a) AL3_x(AL1(a)) + #define AL4_(a) AL4_x(AL1(a)) +//============================================================================================================================== + AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));} + AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));} + AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));} + AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASU1(a),ASU1(b)));} + AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASU2(a),ASU2(b)));} + AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASU3(a),ASU3(b)));} + AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASU1(a),ASU1(b)));} + AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASU2(a),ASU2(b)));} + AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASU3(a),ASU3(b)));} + AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASU4(a),ASU4(b)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ 
+//============================================================================================================================== +// WAVE OPERATIONS +//============================================================================================================================== + #ifdef A_WAVE + AF1 AWaveAdd(AF1 v){return subgroupAdd(v);} + AF2 AWaveAdd(AF2 v){return subgroupAdd(v);} + AF3 AWaveAdd(AF3 v){return subgroupAdd(v);} + AF4 AWaveAdd(AF4 v){return subgroupAdd(v);} + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// HLSL +// +// +//============================================================================================================================== +#if defined(A_HLSL) && defined(A_GPU) + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 
+//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 float2 + #define AF3 float3 + #define AF4 float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uint2 + #define AU3 uint3 + #define AU4 uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 int2 + #define ASU3 int3 + #define ASU4 int4 +//============================================================================================================================== + #define AF1_AU1(x) asfloat(AU1(x)) + #define AF2_AU2(x) asfloat(AU2(x)) + #define AF3_AU3(x) asfloat(AU3(x)) + #define AF4_AU4(x) asfloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) asuint(AF1(x)) + #define AU2_AF2(x) asuint(AF2(x)) + #define AU3_AF3(x) asuint(AF3(x)) + #define AU4_AF4(x) asuint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} + #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) + #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} + #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x)) +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 
a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1<>off)&mask;} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL BYTE 
+//============================================================================================================================== + #ifdef A_BYTE + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #define AH1 min16float + #define AH2 min16float2 + #define AH3 min16float3 + #define AH4 min16float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 min16uint + #define AW2 min16uint2 + #define AW3 min16uint3 + #define AW4 min16uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 min16int + #define ASW2 min16int2 + #define ASW3 min16int3 + #define ASW4 min16int4 +//============================================================================================================================== + // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). 
+ // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ + AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} + AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} + AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} + AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));} + #define AH2_AU1(x) AH2_AU1_x(AU1(x)) + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) AW2_AU1_x(AU1(x)) + #define AW4_AU2(x) AW4_AU2_x(AU2(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} + AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));} + AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} + AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));} + #define AU1_AH2(x) AU1_AH2_x(AH2(x)) + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) AU1_AW2_x(AW2(x)) + #define AU2_AW4(x) AU2_AW4_x(AW4(x)) +//============================================================================================================================== + // TODO: These are broken!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + #define AW1_AH1(x) AW1(asuint(AF1(x))) + #define AW2_AH2(x) AW2(asuint(AF2(x))) + #define AW3_AH3(x) AW3(asuint(AF3(x))) + #define AW4_AH4(x) AW4(asuint(AF4(x))) +//------------------------------------------------------------------------------------------------------------------------------ + // TODO: These are broken!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ #define AH1_AW1(x) AH1(asfloat(AU1(x))) + #define AH2_AW2(x) AH2(asfloat(AU2(x))) + #define AH3_AW3(x) AH3(asfloat(AU3(x))) + #define AH4_AW4(x) AH4(asfloat(AU4(x))) +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + // V_FRACT_F16 (note DX frac() is different). 
+ AH1 AFractH1(AH1 x){return x-floor(x);} + AH2 AFractH2(AH2 x){return x-floor(x);} + AH3 AFractH3(AH3 x){return x-floor(x);} + AH4 AFractH4(AH4 x){return x-floor(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} 
+//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return rcp(x);} + AH2 ARcpH2(AH2 x){return rcp(x);} + AH3 ARcpH3(AH3 x){return rcp(x);} + AH4 ARcpH4(AH4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return rsqrt(x);} + AH2 ARsqH2(AH2 x){return rsqrt(x);} + AH3 ARsqH3(AH3 x){return rsqrt(x);} + AH4 ARsqH4(AH4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return saturate(x);} + AH2 ASatH2(AH2 x){return saturate(x);} + AH3 ASatH3(AH3 x){return saturate(x);} + AH4 ASatH4(AH4 x){return saturate(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #define AD1 double + #define AD2 double2 + #define AD3 double3 + #define AD4 double4 
+//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 a){return a-floor(a);} + AD2 AFractD2(AD2 a){return a-floor(a);} + AD3 AFractD3(AD3 a){return a-floor(a);} + AD4 AFractD4(AD4 a){return a-floor(a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return rcp(x);} + AD2 ARcpD2(AD2 x){return rcp(x);} + AD3 ARcpD3(AD3 x){return rcp(x);} + AD4 ARcpD4(AD4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return rsqrt(x);} + AD2 ARsqD2(AD2 x){return rsqrt(x);} + AD3 ARsqD3(AD3 x){return rsqrt(x);} + AD4 ARsqD4(AD4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return saturate(x);} + AD2 ASatD2(AD2 x){return saturate(x);} + AD3 ASatD3(AD3 x){return saturate(x);} + AD4 ASatD4(AD4 x){return saturate(x);} + #endif 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL LONG +//============================================================================================================================== + #ifdef A_LONG + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU COMMON +// +// 
+//============================================================================================================================== +#ifdef A_GPU + // Negative and positive infinity. + #define A_INFN_F AF1_AU1(0x7f800000u) + #define A_INFP_F AF1_AU1(0xff800000u) +//------------------------------------------------------------------------------------------------------------------------------ + // Copy sign from 's' to positive 'd'. + AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));} + AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));} + AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));} + AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Single operation to return (useful to create a mask to use in lerp for branch free logic), + // m=NaN := 0 + // m>=0 := 0 + // m<0 := 1 + // Uses the following useful floating point logic, + // saturate(+a*(-INF)==-INF) := 0 + // saturate( 0*(-INF)== NaN) := 0 + // saturate(-a*(-INF)==+INF) := 1 + AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));} + AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));} + AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));} + AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));} +//============================================================================================================================== + #ifdef A_HALF + #define A_INFN_H AH1_AW1(0x7c00u) + #define A_INFP_H AH1_AW1(0xfc00u) +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));} + AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));} + AH3 ACpySgnH3(AH3 d,AH3 
s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));} + AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));} + AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));} + AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));} + AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These support only positive inputs. +// Did not see value yet in specialization for range. +// Using quick testing, ended up mostly getting the same "best" approximation for various ranges. +// With hardware that can co-execute transcendentals, the value in approximations could be less than expected. +// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total. +// And co-execution would require a compiler interleaving a lot of independent work for packed usage. +//------------------------------------------------------------------------------------------------------------------------------ +// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total). +// Same with sqrt(), as this could be x*rsq() (7 ops). 
//------------------------------------------------------------------------------------------------------------------------------
// IDEAS
// =====
//  - Polaris hardware has 16-bit support, but non-double rate.
//    Could be possible still get part double rate for some of this logic,
//    by clearing out the lower half's sign when necessary and using 32-bit ops...
//==============================================================================================================================
 #ifdef A_HALF
  // Half-precision square-root estimate, 2 ops.
  // Squared error is minimized over the full positive range; with the 0x1de2 bias, inputs in {0 to 1} map below 1.0.
  AH1 APrxLoSqrtH1(AH1 x){return AH1_AW1((AW1_AH1(x)>>AW1_(1))+AW1_(0x1de2));}
  AH2 APrxLoSqrtH2(AH2 x){return AH2_AW2((AW2_AH2(x)>>AW2_(1))+AW2_(0x1de2));}
//------------------------------------------------------------------------------------------------------------------------------
  // Cheapest reciprocal estimate, a single op.
  // Squared error is minimized over {smallest normal to 16384.0}.
  AH1 APrxLoRcpH1(AH1 x){return AH1_AW1(AW1_(0x7784)-AW1_AH1(x));}
  AH2 APrxLoRcpH2(AH2 x){return AH2_AW2(AW2_(0x7784)-AW2_AH2(x));}
//------------------------------------------------------------------------------------------------------------------------------
  // Reciprocal estimate refined with one Newton-Raphson step, 3 ops.
  AH1 APrxMedRcpH1(AH1 x){AH1 e=AH1_AW1(AW1_(0x778d)-AW1_AH1(x));return e*(-e*x+AH1_(2.0));}
  AH2 APrxMedRcpH2(AH2 x){AH2 e=AH2_AW2(AW2_(0x778d)-AW2_AH2(x));return e*(-e*x+AH2_(2.0));}
//------------------------------------------------------------------------------------------------------------------------------
  // Reciprocal square-root estimate, 2 ops; squared error minimized over {smallest normal to 16384.0}.
  AH1 APrxLoRsqH1(AH1 x){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(x)>>AW1_(1)));}
  AH2 APrxLoRsqH2(AH2 x){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(x)>>AW2_(1)));}
 #endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                    FLOAT APPROXIMATIONS
//------------------------------------------------------------------------------------------------------------------------------
// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
//  - Idea dates back to SGI, then to Quake 3, etc.
//  - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
//     - sqrt(x)=rsqrt(x)*x
//     - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
//  - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
//------------------------------------------------------------------------------------------------------------------------------
// These below are from perhaps less complete searching for optimal.
// Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
// So these match up well with the half approximations.
+//============================================================================================================================== + AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));} + AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));} + AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));} + AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PARABOLIC SIN & COS +//------------------------------------------------------------------------------------------------------------------------------ +// Approximate answers to transcendental questions. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Verify packed math ABS is correctly doing an AND. +//============================================================================================================================== + // Valid input range is {-1 to 1} representing {0 to 2 pi}. + // Output range is {-1/4 to -1/4} representing {-1 to 1}. + AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD. 
 AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT
//------------------------------------------------------------------------------------------------------------------------------
 #ifdef A_HALF
  // For a packed {sin,cos} pair,
  //  - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
  //  - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
  AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA
  AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND
 #endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                     COLOR CONVERSIONS
//------------------------------------------------------------------------------------------------------------------------------
// These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
// These are branch free implementations.
// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
//------------------------------------------------------------------------------------------------------------------------------
// TRANSFER FUNCTIONS
// ==================
// 709 ..... Rec709 used for some HDTVs
// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
// Pq ...... PQ native for HDR10
// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
//------------------------------------------------------------------------------------------------------------------------------
// FOR PQ
// ======
// Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2.
// All constants are only specified to FP32 precision.
// External PQ source reference,
//  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
//------------------------------------------------------------------------------------------------------------------------------
// PACKED VERSIONS
// ===============
// These are the A*H2() functions.
// There is no PQ functions as FP16 seemed to not have enough precision for the conversion.
// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
//------------------------------------------------------------------------------------------------------------------------------
// NOTES
// =====
// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
//==============================================================================================================================
 AF1 ATo709F1(AF1 c){return max(min(c*AF1_(4.5),AF1_(0.018)),AF1_(1.099)*pow(c,AF1_(0.45))-AF1_(0.099));}
//------------------------------------------------------------------------------------------------------------------------------
 // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
 AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,rcpX);}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
  return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AToSrgbF1(AF1 c){return max(min(c*AF1_(12.92),AF1_(0.0031308)),AF1_(1.055)*pow(c,AF1_(0.41666))-AF1_(0.055));}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AToTwoF1(AF1 c){return sqrt(c);}
//==============================================================================================================================
 AF1 AFrom709F1(AF1 c){return max(min(c*AF1_(1.0/4.5),AF1_(0.081)),
  pow((c+AF1_(0.099))*(AF1_(1.0)/(AF1_(1.099))),AF1_(1.0/0.45)));}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,x);}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
  return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AFromSrgbF1(AF1 c){return max(min(c*AF1_(1.0/12.92),AF1_(0.04045)),
  pow((c+AF1_(0.055))*(AF1_(1.0)/AF1_(1.055)),AF1_(2.4)));}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AFromTwoF1(AF1 c){return c*c;}
//==============================================================================================================================
 // Packed FP16 variants; same transfer curves as the AF1 versions above (no PQ variant, see PACKED VERSIONS note).
 #ifdef A_HALF
  AH2 ATo709H2(AH2 c){return max(min(c*AH2_(4.5),AH2_(0.018)),AH2_(1.099)*pow(c,AH2_(0.45))-AH2_(0.099));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AToSrgbH2(AH2 c){return max(min(c*AH2_(12.92),AH2_(0.0031308)),AH2_(1.055)*pow(c,AH2_(0.41666))-AH2_(0.055));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AToTwoH2(AH2 c){return sqrt(c);}
 #endif
//==============================================================================================================================
 #ifdef A_HALF
  AH2 AFrom709H2(AH2 c){return max(min(c*AH2_(1.0/4.5),AH2_(0.081)),
   pow((c+AH2_(0.099))*(AH2_(1.0)/(AH2_(1.099))),AH2_(1.0/0.45)));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AFromSrgbH2(AH2 c){return max(min(c*AH2_(1.0/12.92),AH2_(0.04045)),
   pow((c+AH2_(0.055))*(AH2_(1.0)/AH2_(1.055)),AH2_(2.4)));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AFromTwoH2(AH2 c){return c*c;}
 #endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CS REMAP +//============================================================================================================================== + // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear. + // 543210 + // ====== + // ..xxx. + // yy...y + AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} +//============================================================================================================================== + // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions. + // 543210 + // ====== + // .xx..x + // y..yy. + // Details, + // LANE TO 8x8 MAPPING + // =================== + // 00 01 08 09 10 11 18 19 + // 02 03 0a 0b 12 13 1a 1b + // 04 05 0c 0d 14 15 1c 1d + // 06 07 0e 0f 16 17 1e 1f + // 20 21 28 29 30 31 38 39 + // 22 23 2a 2b 32 33 3a 3b + // 24 25 2c 2d 34 35 3c 3d + // 26 27 2e 2f 36 37 3e 3f + AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// REFERENCE +// +//------------------------------------------------------------------------------------------------------------------------------ +// IEEE FLOAT RULES +// ================ +// - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1 +// - {+/-}0 * {+/-}INF = NaN +// - -INF + (+INF) = NaN +// - {+/-}0 / {+/-}0 = NaN +// - {+/-}INF / {+/-}INF = NaN +// - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN) +// - 0 == -0 +// - 4/0 = +INF +// - 4/-0 = -INF +// - 4+INF = +INF +// - 4-INF = -INF +// - 4*(+INF) = +INF +// - 4*(-INF) = -INF +// - -4*(+INF) = -INF +// - sqrt(+INF) = +INF +//------------------------------------------------------------------------------------------------------------------------------ +// FP16 ENCODING +// ============= +// fedcba9876543210 +// ---------------- +// ......mmmmmmmmmm 10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals) +// .eeeee.......... 5-bit exponent +// .00000.......... denormals +// .00001.......... -14 exponent +// .11110.......... 15 exponent +// .111110000000000 infinity +// .11111nnnnnnnnnn NaN with n!=0 +// s............... sign +//------------------------------------------------------------------------------------------------------------------------------ +// FP16/INT16 ALIASING DENORMAL +// ============================ +// 11-bit unsigned integers alias with half float denormal/normal values, +// 1 = 2^(-24) = 1/16777216 ....................... first denormal value +// 2 = 2^(-23) +// ... +// 1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value +// 1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers +// 2047 .............................................. 
last normal value that still maps to integers +// Scaling limits, +// 2^15 = 32768 ...................................... largest power of 2 scaling +// Largest pow2 conversion mapping is at *32768, +// 1 : 2^(-9) = 1/128 +// 1024 : 8 +// 2047 : a little less than 16 +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU/CPU PORTABILITY +// +// +//------------------------------------------------------------------------------------------------------------------------------ +// This is the GPU implementation. +// See the CPU implementation for docs. 
//==============================================================================================================================
#ifdef A_GPU
 // Boolean and storage-class tokens for the GPU side (the CPU implementation maps these differently).
 #define A_TRUE true
 #define A_FALSE false
 #define A_STATIC
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                      VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
//==============================================================================================================================
 // 'ret*' -- vector return-type tokens.
 #define retAD2 AD2
 #define retAD3 AD3
 #define retAD4 AD4
 #define retAF2 AF2
 #define retAF3 AF3
 #define retAF4 AF4
 #define retAL2 AL2
 #define retAL3 AL3
 #define retAL4 AL4
 #define retAU2 AU2
 #define retAU3 AU3
 #define retAU4 AU4
//------------------------------------------------------------------------------------------------------------------------------
 // 'in*' -- input-parameter tokens.
 #define inAD2 in AD2
 #define inAD3 in AD3
 #define inAD4 in AD4
 #define inAF2 in AF2
 #define inAF3 in AF3
 #define inAF4 in AF4
 #define inAL2 in AL2
 #define inAL3 in AL3
 #define inAL4 in AL4
 #define inAU2 in AU2
 #define inAU3 in AU3
 #define inAU4 in AU4
//------------------------------------------------------------------------------------------------------------------------------
 // 'inout*' -- read/write parameter tokens.
 #define inoutAD2 inout AD2
 #define inoutAD3 inout AD3
 #define inoutAD4 inout AD4
 #define inoutAF2 inout AF2
 #define inoutAF3 inout AF3
 #define inoutAF4 inout AF4
 #define inoutAL2 inout AL2
 #define inoutAL3 inout AL3
 #define inoutAL4 inout AL4
 #define inoutAU2 inout AU2
 #define inoutAU3 inout AU3
 #define inoutAU4 inout AU4
//------------------------------------------------------------------------------------------------------------------------------
 // 'out*' -- write-only parameter tokens.
 #define outAD2 out AD2
 #define outAD3 out AD3
 #define outAD4 out AD4
 #define outAF2 out AF2
 #define outAF3 out AF3
 #define outAF4 out AF4
 #define outAL2 out AL2
 #define outAL3 out AL3
 #define outAL4 out AL4
 #define outAU2 out AU2
 #define outAU3 out AU3
 #define outAU4 out AU4
//------------------------------------------------------------------------------------------------------------------------------
 // 'var*(x)' -- local variable declaration tokens.
 #define varAD2(x) AD2 x
 #define varAD3(x) AD3 x
 #define varAD4(x) AD4 x
 #define varAF2(x) AF2 x
 #define varAF3(x) AF3 x
 #define varAF4(x) AF4 x
 #define varAL2(x) AL2 x
 #define varAL3(x) AL3 x
 #define varAL4(x) AL4 x
 #define varAU2(x) AU2 x
 #define varAU3(x) AU3 x
 #define varAU4(x) AU4 x
//------------------------------------------------------------------------------------------------------------------------------
 // 'init*(...)' -- vector constructor/initializer tokens.
 #define initAD2(x,y) AD2(x,y)
 #define initAD3(x,y,z) AD3(x,y,z)
 #define initAD4(x,y,z,w) AD4(x,y,z,w)
 #define initAF2(x,y) AF2(x,y)
 #define initAF3(x,y,z) AF3(x,y,z)
 #define initAF4(x,y,z,w) AF4(x,y,z,w)
 #define initAL2(x,y) AL2(x,y)
 #define initAL3(x,y,z) AL3(x,y,z)
 #define initAL4(x,y,z,w) AL4(x,y,z,w)
 #define initAU2(x,y) AU2(x,y)
 #define initAU3(x,y,z) AU3(x,y,z)
 #define initAU4(x,y,z,w) AU4(x,y,z,w)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                      SCALAR RETURN OPS
//==============================================================================================================================
 #define AAbsD1(a) abs(AD1(a))
 #define AAbsF1(a) abs(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ACosD1(a) cos(AD1(a))
 #define ACosF1(a) cos(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ADotD2(a,b) dot(AD2(a),AD2(b))
 #define ADotD3(a,b) dot(AD3(a),AD3(b))
 #define ADotD4(a,b) dot(AD4(a),AD4(b))
 #define ADotF2(a,b) dot(AF2(a),AF2(b))
 #define ADotF3(a,b) dot(AF3(a),AF3(b))
 #define ADotF4(a,b) dot(AF4(a),AF4(b))
//------------------------------------------------------------------------------------------------------------------------------
 #define AExp2D1(a) exp2(AD1(a))
 #define AExp2F1(a) exp2(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define AFloorD1(a) floor(AD1(a))
 #define AFloorF1(a) floor(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ALog2D1(a) log2(AD1(a))
 #define ALog2F1(a) log2(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 // BUGFIX(review): the AMax* macros expanded to min(a,b); restored to max(a,b).
 #define AMaxD1(a,b) max(a,b)
 #define AMaxF1(a,b) max(a,b)
 #define AMaxL1(a,b) max(a,b)
 #define AMaxU1(a,b) max(a,b)
//------------------------------------------------------------------------------------------------------------------------------
 #define AMinD1(a,b) min(a,b)
 #define AMinF1(a,b) min(a,b)
 #define AMinL1(a,b) min(a,b)
 #define AMinU1(a,b) min(a,b)
//------------------------------------------------------------------------------------------------------------------------------
 #define ASinD1(a) sin(AD1(a))
 #define ASinF1(a) sin(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ASqrtD1(a) sqrt(AD1(a))
 #define ASqrtF1(a) sqrt(AF1(a))
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                SCALAR RETURN OPS - DEPENDENT
//==============================================================================================================================
 #define APowD1(a,b) pow(AD1(a),AF1(b))
 #define APowF1(a,b) pow(AF1(a),AF1(b))
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                         VECTOR OPS
//------------------------------------------------------------------------------------------------------------------------------
// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + #ifdef A_DUBL + AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;} + AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;} + AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;} + AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;} + AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;} + AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;} + AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;} + AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;} + AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;} + AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;} + AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;} + AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 
b){d=max(a,b);return d;} + AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;} + AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;} + AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;} + AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;} + AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;} + AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;} + AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;} + AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;} + AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;} + AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;} + AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;} + #endif +//============================================================================================================================== + AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;} + AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;} + AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;} 
+//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;} + AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;} + AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;} + AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;} + AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;} + AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;} + AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;} + AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;} + AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;} + AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;} + AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;} + AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;} + AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;} 
+//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;} + AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;} + AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;} + AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;} + AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;} + AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;} + AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;} + AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;} + AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;} +#endif \ No newline at end of file diff --git a/bin/data/shaders/ext/ffx_spd.h b/bin/data/shaders/ext/ffx_spd.h new file mode 100644 index 00000000..dc167261 --- /dev/null +++ b/bin/data/shaders/ext/ffx_spd.h @@ -0,0 +1,1297 @@ +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// [FFX SPD] Single Pass Downsampler 2.0 +// +//============================================================================================================================== +// LICENSE +// ======= +// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved. 
+// ------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGELIST v2.0 +// =============== +// - Added support for cube and array textures. SpdDownsample and SpdDownsampleH shader functions now take index of texture slice +// as an additional parameter. For regular texture use 0. +// - Added support for updating only sub-rectangle of the texture. Additional, optional parameter workGroupOffset added to shader +// functions SpdDownsample and SpdDownsampleH. +// - Added C function SpdSetup that helps to setup constants to be passed as a constant buffer. 
+// - The global atomic counter is automatically reset to 0 by the shader at the end, so you do not need to clear it before every +// use, just once after creation +// +//------------------------------------------------------------------------------------------------------------------------------ +// INTEGRATION SUMMARY FOR CPU +// =========================== +// // you need to provide as constants: +// // number of mip levels to be computed (maximum is 12) +// // number of total thread groups: ((widthInPixels+63)>>6) * ((heightInPixels+63)>>6) +// // workGroupOffset -> by default 0, if you only downsample a rectancle within the source texture use SpdSetup function to calculate correct offset +// ... +// // Dispatch the shader such that each thread group works on a 64x64 sub-tile of the source image +// // for Cube Textures or Texture2DArray, use the z dimension +// vkCmdDispatch(cmdBuf,(widthInPixels+63)>>6,(heightInPixels+63)>>6, slices); + +// // you can also use the SpdSetup function: +// //on top of your cpp file: +// #define A_CPU +// #include "ffx_a.h" +// #include "ffx_spd.h" +// // before your dispatch call, use SpdSetup function to get your constants +// varAU2(dispatchThreadGroupCountXY); // output variable +// varAU2(workGroupOffset); // output variable, this constants are required if Left and Top are not 0,0 +// varAU2(numWorkGroupsAndMips); // output variable +// // input information about your source texture: +// // left and top of the rectancle within your texture you want to downsample +// // width and height of the rectancle you want to downsample +// // if complete source texture should get downsampled: left = 0, top = 0, width = sourceTexture.width, height = sourceTexture.height +// varAU4(rectInfo) = initAU4(0, 0, m_Texture.GetWidth(), m_Texture.GetHeight()); // left, top, width, height +// SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo); +// ... 
+// // constants: +// data.numWorkGroupsPerSlice = numWorkGroupsAndMips[0]; +// data.mips = numWorkGroupsAndMips[1]; +// data.workGroupOffset[0] = workGroupOffset[0]; +// data.workGroupOffset[1] = workGroupOffset[1]; +// ... +// uint32_t dispatchX = dispatchThreadGroupCountXY[0]; +// uint32_t dispatchY = dispatchThreadGroupCountXY[1]; +// uint32_t dispatchZ = m_CubeTexture.GetArraySize(); // slices - for 2D Texture this is 1, for cube texture 6 +// vkCmdDispatch(cmd_buf, dispatchX, dispatchY, dispatchZ); + +//------------------------------------------------------------------------------------------------------------------------------ +// INTEGRATION SUMMARY FOR GPU +// =========================== + +// [SAMPLER] - if you want to use a sampler with linear filtering for loading the source image +// follow additionally the instructions marked with [SAMPLER] +// add following define: +// #define SPD_LINEAR_SAMPLER +// this is recommended, as using one sample() with linear filter to reduce 2x2 is faster +// than 4x load() plus manual averaging + +// // Setup layout. Example below for VK_FORMAT_R16G16B16A16_SFLOAT. 
+// // Note: If you use SRGB format for UAV load() and store() (if it's supported), you need to convert to and from linear space +// // when using UAV load() and store() +// // approximate conversion to linear (load function): x*x +// // approximate conversion from linear (store function): sqrt() +// // or use more accurate functions from ffx_a.h: AFromSrgbF1(value) and AToSrgbF1(value) +// // Recommendation: use UNORM format instead of SRGB for UAV access, and SRGB for SRV access +// // look in the sample app to see how it's done + +// // source image +// // if cube texture use image2DArray / Texture2DArray and adapt your load/store/sample calls +// GLSL: layout(set=0,binding=0,rgba16f)uniform image2D imgSrc; +// [SAMPLER]: layout(set=0,binding=0)uniform texture2D imgSrc; +// HLSL: [[vk::binding(0)]] Texture2D imgSrc :register(u0); + +// // destination -> 12 is the maximum number of mips supported by SPD +// GLSL: layout(set=0,binding=1,rgba16f) uniform coherent image2D imgDst[12]; +// HLSL: [[vk::binding(1)]] globallycoherent RWTexture2D imgDst[12] :register(u1); + +// // global atomic counter - MUST be initialized to 0 +// // SPD resets the counter back after each run by calling SpdResetAtomicCounter(slice) +// // if you have more than 1 slice (== if you downsample a cube texture or a texture2Darray) +// // you have an array of counters: counter[6] -> if you have 6 slices for example +// // GLSL: +// layout(std430, set=0, binding=2) coherent buffer SpdGlobalAtomicBuffer +// { +// uint counter; +// } spdGlobalAtomic; +// // HLSL: +// struct SpdGlobalAtomicBuffer +// { +// uint counter; +// }; +// [[vk::binding(2)]] globallycoherent RWStructuredBuffer spdGlobalAtomic; + +// // [SAMPLER] add sampler +// GLSL: layout(set=0, binding=3) uniform sampler srcSampler; +// HLSL: [[vk::binding(3)]] SamplerState srcSampler :register(s0); + +// // constants - either push constant or constant buffer +// // or calculate within shader +// // [SAMPLER] when using sampler add 
inverse source image size +// // GLSL: +// layout(push_constant) uniform SpdConstants { +// uint mips; // needed to opt out earlier if mips are < 12 +// uint numWorkGroups; // number of total thread groups, so numWorkGroupsX * numWorkGroupsY * 1 +// // it is important to NOT take the number of slices (z dimension) into account here +// // as each slice has its own counter! +// vec2 workGroupOffset; // optional - use SpdSetup() function to calculate correct workgroup offset +// } spdConstants; +// // HLSL: +// [[vk::push_constant]] +// cbuffer spdConstants { +// uint mips; +// uint numWorkGroups; +// float2 workGroupOffset; // optional +// }; + +// ... +// // Setup pre-portability-header defines (sets up GLSL/HLSL path, etc) +// #define A_GPU 1 +// #define A_GLSL 1 // or // #define A_HLSL 1 + +// // if you want to use PACKED version +// // recommended if bpc <= 16bit +// #define A_HALF + +// ... +// // Include the portability header (or copy it in without an include). +// #include "ffx_a.h" +// ... 
+ +// // Define LDS variables +// shared AF4 spdIntermediate[16][16]; // HLSL: groupshared +// shared AU1 spdCounter; // HLSL: groupshared +// // PACKED version +// shared AH4 spdIntermediate[16][16]; // HLSL: groupshared +// // Note: You can also use +// shared AF1 spdIntermediateR[16][16]; +// shared AF1 spdIntermediateG[16][16]; +// shared AF1 spdIntermediateB[16][16]; +// shared AF1 spdIntermediateA[16][16]; +// // or for Packed version: +// shared AH2 spdIntermediateRG[16][16]; +// shared AH2 spdIntermediateBA[16][16]; +// // This is potentially faster +// // Adapt your load and store functions accordingly + +// // if subgroup operations are not supported / can't use SM6.0 +// #define SPD_NO_WAVE_OPERATIONS + +// // Define the fetch function(s) and the reduction function +// // if non-power-of-2 textures, add border controls to the load and store functions +// // to make sure the borders of the mip level look as you want it +// // if you don't add border controls you'll read zeros past the border +// // if you load with a sampler, this is obv. handled by your sampler :) +// // this is also the place where you need to do color space transformation if needed +// // E.g. 
if your texture format is SRGB/UNORM and you use the UAV load and store functions +// // no automatic to/from linear conversions are happening +// // there is to/from linear conversions when using a sampler and render target approach +// // conversion to linear (load function): x*x +// // conversion from linear (store function): sqrt() + +// AU1 slice parameter is for Cube textures and texture2DArray +// if downsampling Texture2D you can ignore this parameter, otherwise use it to access correct slice +// // Load from source image +// GLSL: AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){return imageLoad(imgSrc, p);} +// HLSL: AF4 SpdLoadSourceImage(ASU2 tex, AU1 slice){return imgSrc[tex];} +// [SAMPLER] don't forget to add the define #SPD_LINEAR_SAMPLER :) +// GLSL: +// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return texture(sampler2D(imgSrc, srcSampler), textureCoord); +// } +// HLSL: +// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return imgSrc.SampleLevel(srcSampler, textureCoord, 0); +// } + +// // SpdLoad() takes a 32-bit signed integer 2D coordinate and loads color. 
+// // Loads the 5th mip level, each value is computed by a different thread group +// // last thread group will access all its elements and compute the subsequent mips +// // reminder: if non-power-of-2 textures, add border controls if you do not want to read zeros past the border +// GLSL: AF4 SpdLoad(ASU2 p, AU1 slice){return imageLoad(imgDst[5],p);} +// HLSL: AF4 SpdLoad(ASU2 tex, AU1 slice){return imgDst[5][tex];} + +// Define the store function +// GLSL: void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], p, value);} +// HLSL: void SpdStore(ASU2 pix, AF4 value, AU1 mip, AU1 slice){imgDst[mip][pix] = value;} + +// // Define the atomic counter increase function +// // each slice only reads and stores to its specific slice counter +// // so, if you have several slices it's +// // InterlockedAdd(spdGlobalAtomic[0].counter[slice], 1, spdCounter); +// // GLSL: +// void SpdIncreaseAtomicCounter(AU1 slice){spdCounter = atomicAdd(spdGlobalAtomic.counter, 1);} +// AU1 SpdGetAtomicCounter() {return spdCounter;} +// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic.counter[slice] = 0;} +// // HLSL: +// void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter, 1, spdCounter);} +// AU1 SpdGetAtomicCounter(){return spdCounter;} +// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic[0].counter[slice] = 0;} + +// // Define the LDS load and store functions +// // GLSL: +// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];} +// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spdIntermediate[x][y] = value;} +// // HLSL: +// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];} +// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spdIntermediate[x][y] = value;} + +// // Define your reduction function: takes as input the four 2x2 values and returns 1 output value +// Example below: computes the average value +// AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return (v0+v1+v2+v3)*0.25;} 
+ +// // PACKED VERSION +// Load from source image +// GLSL: AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){return AH4(imageLoad(imgSrc, p));} +// HLSL: AH4 SpdLoadSourceImageH(ASU2 tex, AU1 slice){return AH4(imgSrc[tex]);} +// [SAMPLER] +// GLSL: +// AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return AH4(texture(sampler2D(imgSrc, srcSampler), textureCoord)); +// } +// HLSL: +// AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return AH4(imgSrc.SampleLevel(srcSampler, textureCoord, 0)); +// } + +// // SpdLoadH() takes a 32-bit signed integer 2D coordinate and loads color. +// // Loads the 5th mip level, each value is computed by a different thread group +// // last thread group will access all its elements and compute the subsequent mips +// GLSL: AH4 SpdLoadH(ASU2 p, AU1 slice){return AH4(imageLoad(imgDst[5],p));} +// HLSL: AH4 SpdLoadH(ASU2 tex, AU1 slice){return AH4(imgDst[5][tex]);} + +// Define the store function +// GLSL: void SpdStoreH(ASU2 p, AH4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], p, AF4(value));} +// HLSL: void SpdStoreH(ASU2 pix, AH4 value, AU1 index, AU1 slice){imgDst[index][pix] = AF4(value);} + +// // Define the atomic counter increase function +// // GLSL: +// void SpdIncreaseAtomicCounter(AU1 slice){spd_counter = atomicAdd(spdGlobalAtomic.counter, 1);} +// AU1 SpdGetAtomicCounter() {return spdCounter;} +// // HLSL: +// void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter, 1, spdCounter);} +// AU1 SpdGetAtomicCounter(){return spdCounter;} + +// // Define the LDS load and store functions +// // GLSL: +// AH4 SpdLoadIntermediateH(AU1 x, AU1 y){return spdIntermediate[x][y];} +// void SpdStoreIntermediateH(AU1 x, AU1 y, AH4 value){spdIntermediate[x][y] = value;} +// // HLSL: +// AH4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];} +// void SpdStoreIntermediate(AU1 x, AU1 y, AH4 
value){spdIntermediate[x][y] = value;} + +// // Define your reduction function: takes as input the four 2x2 values and returns 1 output value +// Example below: computes the average value +// AH4 SpdReduce4H(AH4 v0, AH4 v1, AH4 v2, AH4 v3){return (v0+v1+v2+v3)*AH1(0.25);} + +// // + +// // If you only use PACKED version +// #define SPD_PACKED_ONLY + +// // Include this SPD (single pass downsampler) header file (or copy it in without an include). +// #include "ffx_spd.h" +// ... + +// // Example in shader integration +// // GLSL: +// layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; +// void main(){ +// // Call the downsampling function +// // WorkGroupId.z should be 0 if you only downsample a Texture2D! +// SpdDownsample(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex), +// AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups), AU1(WorkGroupId.z)); +// +// // PACKED: +// SpdDownsampleH(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex), +// AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups), AU1(WorkGroupId.z)); +// ... +// // HLSL: +// [numthreads(256,1,1)] +// void main(uint3 WorkGroupId : SV_GroupID, uint LocalThreadIndex : SV_GroupIndex) { +// SpdDownsample(AU2(WorkGroupId.xy), AU1(LocalThreadIndex), +// AU1(mips), AU1(numWorkGroups), AU1(WorkGroupId.z)); +// +// // PACKED: +// SpdDownsampleH(AU2(WorkGroupId.xy), AU1(LocalThreadIndex), +// AU1(mips), AU1(numWorkGroups), AU1(WorkGroupId.z)); +// ... 
+ +// +//------------------------------------------------------------------------------------------------------------------------------ + +//============================================================================================================================== +// SPD Setup +//============================================================================================================================== +#ifdef A_CPU +A_STATIC void SpdSetup( +outAU2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy +outAU2 workGroupOffset, // GPU side: pass in as constant +outAU2 numWorkGroupsAndMips, // GPU side: pass in as constant +inAU4 rectInfo, // left, top, width, height +ASU1 mips // optional: if -1, calculate based on rect width and height +){ + workGroupOffset[0] = rectInfo[0] / 64; // rectInfo[0] = left + workGroupOffset[1] = rectInfo[1] / 64; // rectInfo[1] = top + + AU1 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64; // rectInfo[0] = left, rectInfo[2] = width + AU1 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64; // rectInfo[1] = top, rectInfo[3] = height + + dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0]; + dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1]; + + numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]); + + if (mips >= 0) { + numWorkGroupsAndMips[1] = AU1(mips); + } else { // calculate based on rect width and height + AU1 resolution = AMaxU1(rectInfo[2], rectInfo[3]); + numWorkGroupsAndMips[1] = AU1((AMinF1(AFloorF1(ALog2F1(AF1(resolution))), AF1(12)))); + } +} + +A_STATIC void SpdSetup( + outAU2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy + outAU2 workGroupOffset, // GPU side: pass in as constant + outAU2 numWorkGroupsAndMips, // GPU side: pass in as constant + inAU4 rectInfo // left, top, width, height +) { + SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1); +} +#endif // #ifdef 
A_CPU +//============================================================================================================================== +// NON-PACKED VERSION +//============================================================================================================================== +#ifdef A_GPU +#ifdef SPD_PACKED_ONLY + // Avoid compiler error + AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){return AF4(0.0,0.0,0.0,0.0);} + AF4 SpdLoad(ASU2 p, AU1 slice){return AF4(0.0,0.0,0.0,0.0);} + void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice){} + AF4 SpdLoadIntermediate(AU1 x, AU1 y){return AF4(0.0,0.0,0.0,0.0);} + void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){} + AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return AF4(0.0,0.0,0.0,0.0);} +#endif // #ifdef SPD_PACKED_ONLY + +//_____________________________________________________________/\_______________________________________________________________ +#if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) +#extension GL_KHR_shader_subgroup_quad:require +#endif + +void SpdWorkgroupShuffleBarrier() { +#ifdef A_GLSL + barrier(); +#endif +#ifdef A_HLSL + GroupMemoryBarrierWithGroupSync(); +#endif +} + +// Only last active workgroup should proceed +bool SpdExitWorkgroup(AU1 numWorkGroups, AU1 localInvocationIndex, AU1 slice) +{ + // global atomic counter + if (localInvocationIndex == 0) + { + SpdIncreaseAtomicCounter(slice); + } + SpdWorkgroupShuffleBarrier(); + return (SpdGetAtomicCounter() != (numWorkGroups - 1)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// User defined: AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3); + +AF4 SpdReduceQuad(AF4 v) +{ + #if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + AF4 v0 = v; + AF4 v1 = subgroupQuadSwapHorizontal(v); + AF4 v2 = 
subgroupQuadSwapVertical(v); + AF4 v3 = subgroupQuadSwapDiagonal(v); + return SpdReduce4(v0, v1, v2, v3); + #elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + // requires SM6.0 + AU1 quad = WaveGetLaneIndex() & (~0x3); + AF4 v0 = v; + AF4 v1 = WaveReadLaneAt(v, quad | 1); + AF4 v2 = WaveReadLaneAt(v, quad | 2); + AF4 v3 = WaveReadLaneAt(v, quad | 3); + return SpdReduce4(v0, v1, v2, v3); + /* + // if SM6.0 is not available, you can use the AMD shader intrinsics + // the AMD shader intrinsics are available in AMD GPU Services (AGS) library: + // https://gpuopen.com/amd-gpu-services-ags-library/ + // works for DX11 + AF4 v0 = v; + AF4 v1; + v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + AF4 v2; + v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + AF4 v3; + v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + return SpdReduce4(v0, v1, v2, v3); + */ + #endif + return v; +} + +AF4 SpdReduceIntermediate(AU2 i0, AU2 i1, AU2 i2, AU2 i3) +{ + AF4 v0 = SpdLoadIntermediate(i0.x, i0.y); + AF4 v1 = SpdLoadIntermediate(i1.x, i1.y); + 
AF4 v2 = SpdLoadIntermediate(i2.x, i2.y); + AF4 v3 = SpdLoadIntermediate(i3.x, i3.y); + return SpdReduce4(v0, v1, v2, v3); +} + +AF4 SpdReduceLoad4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice) +{ + AF4 v0 = SpdLoad(ASU2(i0), slice); + AF4 v1 = SpdLoad(ASU2(i1), slice); + AF4 v2 = SpdLoad(ASU2(i2), slice); + AF4 v3 = SpdLoad(ASU2(i3), slice); + return SpdReduce4(v0, v1, v2, v3); +} + +AF4 SpdReduceLoad4(AU2 base, AU1 slice) +{ + return SpdReduceLoad4( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1)), + slice); +} + +AF4 SpdReduceLoadSourceImage4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice) +{ + AF4 v0 = SpdLoadSourceImage(ASU2(i0), slice); + AF4 v1 = SpdLoadSourceImage(ASU2(i1), slice); + AF4 v2 = SpdLoadSourceImage(ASU2(i2), slice); + AF4 v3 = SpdLoadSourceImage(ASU2(i3), slice); + return SpdReduce4(v0, v1, v2, v3); +} + +AF4 SpdReduceLoadSourceImage(AU2 base, AU1 slice) +{ +#ifdef SPD_LINEAR_SAMPLER + return SpdLoadSourceImage(ASU2(base), slice); +#else + return SpdReduceLoadSourceImage4( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1)), + slice); +#endif +} + +void SpdDownsampleMips_0_1_Intrinsics(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ + AF4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[0], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[1], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[2], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + 
ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[3], 0, slice); + + if (mip <= 1) + return; + + v[0] = SpdReduceQuad(v[0]); + v[1] = SpdReduceQuad(v[1]); + v[2] = SpdReduceQuad(v[2]); + v[3] = SpdReduceQuad(v[3]); + + if ((localInvocationIndex % 4) == 0) + { + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x/2, y/2), v[0], 1, slice); + SpdStoreIntermediate( + x/2, y/2, v[0]); + + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x/2 + 8, y/2), v[1], 1, slice); + SpdStoreIntermediate( + x/2 + 8, y/2, v[1]); + + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x/2, y/2 + 8), v[2], 1, slice); + SpdStoreIntermediate( + x/2, y/2 + 8, v[2]); + + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x/2 + 8, y/2 + 8), v[3], 1, slice); + SpdStoreIntermediate( + x/2 + 8, y/2 + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1_LDS(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ + AF4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[0], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[1], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[2], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[3], 0, slice); + + if (mip <= 1) + return; + + for (int i = 0; i < 4; i++) + { + SpdStoreIntermediate(x, y, v[i]); + SpdWorkgroupShuffleBarrier(); + if (localInvocationIndex < 64) + { + v[i] = SpdReduceIntermediate( + AU2(x * 2 + 0, y * 2 + 0), + AU2(x * 2 + 1, y * 2 + 0), + AU2(x * 
2 + 0, y * 2 + 1), + AU2(x * 2 + 1, y * 2 + 1) + ); + SpdStore(ASU2(workGroupID.xy * 16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice); + } + SpdWorkgroupShuffleBarrier(); + } + + if (localInvocationIndex < 64) + { + SpdStoreIntermediate(x + 0, y + 0, v[0]); + SpdStoreIntermediate(x + 8, y + 0, v[1]); + SpdStoreIntermediate(x + 0, y + 8, v[2]); + SpdStoreIntermediate(x + 8, y + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice); +#else + SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice); +#endif +} + + +void SpdDownsampleMip_2(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 64) + { + AF4 v = SpdReduceIntermediate( + AU2(x * 2 + 0, y * 2 + 0), + AU2(x * 2 + 1, y * 2 + 0), + AU2(x * 2 + 0, y * 2 + 1), + AU2(x * 2 + 1, y * 2 + 1) + ); + SpdStore(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip, slice); + // store to LDS, try to reduce bank conflicts + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // ... 
+ // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + SpdStoreIntermediate(x * 2 + y % 2, y * 2, v); + } +#else + AF4 v = SpdLoadIntermediate(x, y); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy * 8) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediate(x + (y/2) % 2, y, v); + } +#endif +} + +void SpdDownsampleMip_3(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 16) + { + // x 0 x 0 + // 0 0 0 0 + // 0 x 0 x + // 0 0 0 0 + AF4 v = SpdReduceIntermediate( + AU2(x * 4 + 0 + 0, y * 4 + 0), + AU2(x * 4 + 2 + 0, y * 4 + 0), + AU2(x * 4 + 0 + 1, y * 4 + 2), + AU2(x * 4 + 2 + 1, y * 4 + 2) + ); + SpdStore(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip, slice); + // store to LDS + // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 + // ... + // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 + // ... + // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x + // ... + SpdStoreIntermediate(x * 4 + y, y * 4, v); + } +#else + if (localInvocationIndex < 64) + { + AF4 v = SpdLoadIntermediate(x * 2 + y % 2,y * 2); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy * 4) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediate(x * 2 + y/2, y * 2, v); + } + } +#endif +} + +void SpdDownsampleMip_4(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 4) + { + // x 0 0 0 x 0 0 0 + // ... 
+ // 0 x 0 0 0 x 0 0 + AF4 v = SpdReduceIntermediate( + AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4), + AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4) + ); + SpdStore(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip, slice); + // store to LDS + // x x x x 0 ... + // 0 ... + SpdStoreIntermediate(x + y * 2, 0, v); + } +#else + if (localInvocationIndex < 16) + { + AF4 v = SpdLoadIntermediate(x * 4 + y,y * 4); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy * 2) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediate(x / 2 + y, 0, v); + } + } +#endif +} + +void SpdDownsampleMip_5(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 1) + { + // x x x x 0 ... + // 0 ... + AF4 v = SpdReduceIntermediate( + AU2(0, 0), + AU2(1, 0), + AU2(2, 0), + AU2(3, 0) + ); + SpdStore(ASU2(workGroupID.xy), v, mip, slice); + } +#else + if (localInvocationIndex < 4) + { + AF4 v = SpdLoadIntermediate(localInvocationIndex,0); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy), v, mip, slice); + } + } +#endif +} + +void SpdDownsampleMips_6_7(AU1 x, AU1 y, AU1 mips, AU1 slice) +{ + ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0); + ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0); + AF4 v0 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v0, 6, slice); + + tex = ASU2(x * 4 + 2, y * 4 + 0); + pix = ASU2(x * 2 + 1, y * 2 + 0); + AF4 v1 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v1, 6, slice); + + tex = ASU2(x * 4 + 0, y * 4 + 2); + pix = ASU2(x * 2 + 0, y * 2 + 1); + AF4 v2 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v2, 6, slice); + + tex = ASU2(x * 4 + 2, y * 4 + 2); + pix = ASU2(x * 2 + 1, y * 2 + 1); + AF4 v3 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v3, 6, slice); + + if (mips <= 7) return; + // no barrier needed, 
working on values only from the same thread + + AF4 v = SpdReduce4(v0, v1, v2, v3); + SpdStore(ASU2(x, y), v, 7, slice); + SpdStoreIntermediate(x, y, v); +} + +void SpdDownsampleNextFour(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice) +{ + if (mips <= baseMip) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice); + + if (mips <= baseMip + 1) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice); + + if (mips <= baseMip + 2) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice); + + if (mips <= baseMip + 3) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice); +} + +void SpdDownsample( + AU2 workGroupID, + AU1 localInvocationIndex, + AU1 mips, + AU1 numWorkGroups, + AU1 slice +) { + AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64); + AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2); + AU1 y = sub_xy.y + 8 * ((localInvocationIndex >> 7)); + SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice); + + SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice); + + if (mips <= 6) return; + + if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice)) return; + + SpdResetAtomicCounter(slice); + + // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels. 
+ SpdDownsampleMips_6_7(x, y, mips, slice); + + SpdDownsampleNextFour(x, y, AU2(0,0), localInvocationIndex, 8, mips, slice); +} + +void SpdDownsample( + AU2 workGroupID, + AU1 localInvocationIndex, + AU1 mips, + AU1 numWorkGroups, + AU1 slice, + AU2 workGroupOffset +) { + SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +//============================================================================================================================== +// PACKED VERSION +//============================================================================================================================== + +#ifdef A_HALF + +#ifdef A_GLSL +#extension GL_EXT_shader_subgroup_extended_types_float16:require +#endif + +AH4 SpdReduceQuadH(AH4 v) +{ + #if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + AH4 v0 = v; + AH4 v1 = subgroupQuadSwapHorizontal(v); + AH4 v2 = subgroupQuadSwapVertical(v); + AH4 v3 = subgroupQuadSwapDiagonal(v); + return SpdReduce4H(v0, v1, v2, v3); + #elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + // requires SM6.0 + AU1 quad = WaveGetLaneIndex() & (~0x3); + AH4 v0 = v; + AH4 v1 = WaveReadLaneAt(v, quad | 1); + AH4 v2 = WaveReadLaneAt(v, quad | 2); + AH4 v3 = WaveReadLaneAt(v, quad | 3); + return SpdReduce4H(v0, v1, v2, v3); + /* + // if SM6.0 is not available, you can use the AMD shader intrinsics + // the AMD shader intrinsics are available in AMD GPU Services (AGS) library: + // https://gpuopen.com/amd-gpu-services-ags-library/ + // works for DX11 + AH4 v0 = v; + AH4 v1; + v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, 
AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + AH4 v2; + v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + AH4 v3; + v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + return SpdReduce4H(v0, v1, v2, v3); + */ + #endif + return AH4(0.0, 0.0, 0.0, 0.0); + +} + +AH4 SpdReduceIntermediateH(AU2 i0, AU2 i1, AU2 i2, AU2 i3) +{ + AH4 v0 = SpdLoadIntermediateH(i0.x, i0.y); + AH4 v1 = SpdLoadIntermediateH(i1.x, i1.y); + AH4 v2 = SpdLoadIntermediateH(i2.x, i2.y); + AH4 v3 = SpdLoadIntermediateH(i3.x, i3.y); + return SpdReduce4H(v0, v1, v2, v3); +} + +AH4 SpdReduceLoad4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice) +{ + AH4 v0 = SpdLoadH(ASU2(i0), slice); + AH4 v1 = SpdLoadH(ASU2(i1), slice); + AH4 v2 = SpdLoadH(ASU2(i2), slice); + AH4 v3 = SpdLoadH(ASU2(i3), slice); + return SpdReduce4H(v0, v1, v2, v3); +} + +AH4 SpdReduceLoad4H(AU2 base, AU1 slice) +{ + return SpdReduceLoad4H( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1)), + slice); +} + +AH4 SpdReduceLoadSourceImage4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice) +{ + AH4 v0 = SpdLoadSourceImageH(ASU2(i0), slice); + AH4 v1 = SpdLoadSourceImageH(ASU2(i1), slice); + AH4 v2 = 
SpdLoadSourceImageH(ASU2(i2), slice); + AH4 v3 = SpdLoadSourceImageH(ASU2(i3), slice); + return SpdReduce4H(v0, v1, v2, v3); +} + +AH4 SpdReduceLoadSourceImageH(AU2 base, AU1 slice) +{ +#ifdef SPD_LINEAR_SAMPLER + return SpdLoadSourceImageH(ASU2(base), slice); +#else + return SpdReduceLoadSourceImage4H( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1)), + slice); +#endif +} + +void SpdDownsampleMips_0_1_IntrinsicsH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice) +{ + AH4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[0], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[1], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[2], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[3], 0, slice); + + if (mips <= 1) + return; + + v[0] = SpdReduceQuadH(v[0]); + v[1] = SpdReduceQuadH(v[1]); + v[2] = SpdReduceQuadH(v[2]); + v[3] = SpdReduceQuadH(v[3]); + + if ((localInvocationIndex % 4) == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2, y/2), v[0], 1, slice); + SpdStoreIntermediateH(x/2, y/2, v[0]); + + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2 + 8, y/2), v[1], 1, slice); + SpdStoreIntermediateH(x/2 + 8, y/2, v[1]); + + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2, y/2 + 8), v[2], 1, slice); + SpdStoreIntermediateH(x/2, y/2 + 8, v[2]); + + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2 + 8, y/2 + 8), v[3], 1, 
slice); + SpdStoreIntermediateH(x/2 + 8, y/2 + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1_LDSH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice) +{ + AH4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[0], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[1], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[2], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[3], 0, slice); + + if (mips <= 1) + return; + + for (int i = 0; i < 4; i++) + { + SpdStoreIntermediateH(x, y, v[i]); + SpdWorkgroupShuffleBarrier(); + if (localInvocationIndex < 64) + { + v[i] = SpdReduceIntermediateH( + AU2(x * 2 + 0, y * 2 + 0), + AU2(x * 2 + 1, y * 2 + 0), + AU2(x * 2 + 0, y * 2 + 1), + AU2(x * 2 + 1, y * 2 + 1) + ); + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice); + } + SpdWorkgroupShuffleBarrier(); + } + + if (localInvocationIndex < 64) + { + SpdStoreIntermediateH(x + 0, y + 0, v[0]); + SpdStoreIntermediateH(x + 8, y + 0, v[1]); + SpdStoreIntermediateH(x + 0, y + 8, v[2]); + SpdStoreIntermediateH(x + 8, y + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice); +#else + SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, 
slice); +#endif +} + + +void SpdDownsampleMip_2H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 64) + { + AH4 v = SpdReduceIntermediateH( + AU2(x * 2 + 0, y * 2 + 0), + AU2(x * 2 + 1, y * 2 + 0), + AU2(x * 2 + 0, y * 2 + 1), + AU2(x * 2 + 1, y * 2 + 1) + ); + SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip, slice); + // store to LDS, try to reduce bank conflicts + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // ... + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v); + } +#else + AH4 v = SpdLoadIntermediateH(x, y); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediateH(x + (y/2) % 2, y, v); + } +#endif +} + +void SpdDownsampleMip_3H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 16) + { + // x 0 x 0 + // 0 0 0 0 + // 0 x 0 x + // 0 0 0 0 + AH4 v = SpdReduceIntermediateH( + AU2(x * 4 + 0 + 0, y * 4 + 0), + AU2(x * 4 + 2 + 0, y * 4 + 0), + AU2(x * 4 + 0 + 1, y * 4 + 2), + AU2(x * 4 + 2 + 1, y * 4 + 2) + ); + SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip, slice); + // store to LDS + // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 + // ... + // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 + // ... + // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x + // ... 
+ SpdStoreIntermediateH(x * 4 + y, y * 4, v); + } +#else + if (localInvocationIndex < 64) + { + AH4 v = SpdLoadIntermediateH(x * 2 + y % 2,y * 2); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediateH(x * 2 + y/2, y * 2, v); + } + } +#endif +} + +void SpdDownsampleMip_4H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 4) + { + // x 0 0 0 x 0 0 0 + // ... + // 0 x 0 0 0 x 0 0 + AH4 v = SpdReduceIntermediateH( + AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4), + AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4) + ); + SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip, slice); + // store to LDS + // x x x x 0 ... + // 0 ... + SpdStoreIntermediateH(x + y * 2, 0, v); + } +#else + if (localInvocationIndex < 16) + { + AH4 v = SpdLoadIntermediateH(x * 4 + y,y * 4); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediateH(x / 2 + y, 0, v); + } + } +#endif +} + +void SpdDownsampleMip_5H(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 1) + { + // x x x x 0 ... + // 0 ... 
+ AH4 v = SpdReduceIntermediateH( + AU2(0, 0), + AU2(1, 0), + AU2(2, 0), + AU2(3, 0) + ); + SpdStoreH(ASU2(workGroupID.xy), v, mip, slice); + } +#else + if (localInvocationIndex < 4) + { + AH4 v = SpdLoadIntermediateH(localInvocationIndex,0); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy), v, mip, slice); + } + } +#endif +} + +void SpdDownsampleMips_6_7H(AU1 x, AU1 y, AU1 mips, AU1 slice) +{ + ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0); + ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0); + AH4 v0 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v0, 6, slice); + + tex = ASU2(x * 4 + 2, y * 4 + 0); + pix = ASU2(x * 2 + 1, y * 2 + 0); + AH4 v1 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v1, 6, slice); + + tex = ASU2(x * 4 + 0, y * 4 + 2); + pix = ASU2(x * 2 + 0, y * 2 + 1); + AH4 v2 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v2, 6, slice); + + tex = ASU2(x * 4 + 2, y * 4 + 2); + pix = ASU2(x * 2 + 1, y * 2 + 1); + AH4 v3 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v3, 6, slice); + + if (mips < 8) return; + // no barrier needed, working on values only from the same thread + + AH4 v = SpdReduce4H(v0, v1, v2, v3); + SpdStoreH(ASU2(x, y), v, 7, slice); + SpdStoreIntermediateH(x, y, v); +} + +void SpdDownsampleNextFourH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice) +{ + if (mips <= baseMip) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice); + + if (mips <= baseMip + 1) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice); + + if (mips <= baseMip + 2) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice); + + if (mips <= baseMip + 3) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, 
slice); +} + +void SpdDownsampleH( + AU2 workGroupID, + AU1 localInvocationIndex, + AU1 mips, + AU1 numWorkGroups, + AU1 slice +) { + AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64); + AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2); + AU1 y = sub_xy.y + 8 * ((localInvocationIndex >> 7)); + + SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice); + + SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice); + + if (mips < 7) return; + + if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice)) return; + + SpdResetAtomicCounter(slice); + + // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels. + SpdDownsampleMips_6_7H(x, y, mips, slice); + + SpdDownsampleNextFourH(x, y, AU2(0,0), localInvocationIndex, 8, mips, slice); +} + +void SpdDownsampleH( + AU2 workGroupID, + AU1 localInvocationIndex, + AU1 mips, + AU1 numWorkGroups, + AU1 slice, + AU2 workGroupOffset +) { + SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice); +} + +#endif // #ifdef A_HALF +#endif // #ifdef A_GPU \ No newline at end of file diff --git a/bin/data/shaders/graph/cull/comp.glsl b/bin/data/shaders/graph/cull/comp.glsl index e0eb3301..bc4041b9 100644 --- a/bin/data/shaders/graph/cull/comp.glsl +++ b/bin/data/shaders/graph/cull/comp.glsl @@ -5,7 +5,7 @@ #extension GL_EXT_samplerless_texture_functions : enable layout (constant_id = 0) const uint PASSES = 6; -layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; #define COMPUTE 1 #define QUERY_MIPMAPS 1 @@ -73,106 +73,100 @@ layout (std140, binding = 3) buffer Objects { layout (binding = 4) uniform sampler2D samplerDepth; +shared vec4 sharedPlanes[PASSES][6]; + vec4 normalizePlane( vec4 p ) { - return p / length(p.xyz); -} - -bool frustumCull( uint id ) { - if ( PushConstant.passes == 0 ) return true; - - const 
DrawCommand drawCommand = drawCommands[id]; - const Instance instance = instances[drawCommand.instanceID]; - const Object object = objects[instance.objectID]; - - if ( drawCommand.indices == 0 || drawCommand.vertices == 0 ) return false; - - bool visible = false; - for ( uint pass = 0; pass < PushConstant.passes; ++pass ) { - mat4 mat = camera.viewport[pass].projection * camera.viewport[pass].view * object.model; - vec4 planes[6]; { - for (int i = 0; i < 3; ++i) - for (int j = 0; j < 2; ++j) { - planes[i*2+j].x = mat[0][3] + (j == 0 ? mat[0][i] : -mat[0][i]); - planes[i*2+j].y = mat[1][3] + (j == 0 ? mat[1][i] : -mat[1][i]); - planes[i*2+j].z = mat[2][3] + (j == 0 ? mat[2][i] : -mat[2][i]); - planes[i*2+j].w = mat[3][3] + (j == 0 ? mat[3][i] : -mat[3][i]); - planes[i*2+j] = normalizePlane( planes[i*2+j] ); - } - } - bool insideFrustum = true; - for ( uint p = 0; p < 6; ++p ) { - float d = max(instance.bounds.min.x * planes[p].x, instance.bounds.max.x * planes[p].x) - + max(instance.bounds.min.y * planes[p].y, instance.bounds.max.y * planes[p].y) - + max(instance.bounds.min.z * planes[p].z, instance.bounds.max.z * planes[p].z); - - if (d < -planes[p].w) { - insideFrustum = false; - break; - } - } - - if ( insideFrustum ) { - visible = true; - break; - } - } - return visible; -} - -bool occlusionCull( uint id ) { - if ( PushConstant.passes == 0 ) return true; - - const DrawCommand drawCommand = drawCommands[id]; - const Instance instance = instances[drawCommand.instanceID]; - const Object object = objects[instance.objectID]; - - bool visible = false; - for ( uint pass = 0; pass < PushConstant.passes; ++pass ) { - vec4 aabb; - vec4 sphere = aabbToSphere( instance.bounds ); - - float scale = length(object.model[0].xyz); - vec3 center = (camera.viewport[pass].view * object.model * vec4(sphere.xyz, 1)).xyz; - float radius = scale * sphere.w; - - mat4 proj = camera.viewport[pass].projection; - float znear = proj[3][2]; - float P00 = proj[0][0]; - float P11 = proj[1][1]; - 
- if ( projectSphere( center, radius, znear, P00, P11, aabb ) ) { - vec2 pyramidSize = vec2(textureSize( samplerDepth, 0 )); - - float width = (aabb.z - aabb.x) * pyramidSize.x; - float height = (aabb.w - aabb.y) * pyramidSize.y; - - float level = max(0.0, floor(log2(max(width, height)))); - - float d1 = textureLod(samplerDepth, vec2(aabb.x, aabb.y), level).x; - float d2 = textureLod(samplerDepth, vec2(aabb.z, aabb.y), level).x; - float d3 = textureLod(samplerDepth, vec2(aabb.x, aabb.w), level).x; - float d4 = textureLod(samplerDepth, vec2(aabb.z, aabb.w), level).x; - - float depth = min(min(d1, d2), min(d3, d4)); // min for reverse-z projection, max for standard - float depthSphere = znear / (center.z - radius); - - if ( depthSphere >= depth - DEPTH_BIAS ) { - visible = true; - break; - } - } else { - visible = true; - break; - } - } - return visible; + return p / length(p.xyz); } void main() { - const uint gID = gl_GlobalInvocationID.x; - if ( !(0 <= gID && gID < drawCommands.length()) ) return; + const uint gID = gl_GlobalInvocationID.x; + const uint lID = gl_LocalInvocationIndex; - bool visible = frustumCull( gID ); - if ( visible ) visible = occlusionCull( gID ); - drawCommands[gID].instances = visible ? 1 : 0; + if ( lID == 0 ) { + for (uint pass = 0; pass < PushConstant.passes; ++pass) { + mat4 mat = camera.viewport[pass].projection * camera.viewport[pass].view; + for (int i = 0; i < 3; ++i) + for (int j = 0; j < 2; ++j) { + sharedPlanes[pass][i*2+j].x = mat[0][3] + (j == 0 ? mat[0][i] : -mat[0][i]); + sharedPlanes[pass][i*2+j].y = mat[1][3] + (j == 0 ? mat[1][i] : -mat[1][i]); + sharedPlanes[pass][i*2+j].z = mat[2][3] + (j == 0 ? mat[2][i] : -mat[2][i]); + sharedPlanes[pass][i*2+j].w = mat[3][3] + (j == 0 ? 
mat[3][i] : -mat[3][i]); + sharedPlanes[pass][i*2+j] = normalizePlane( sharedPlanes[pass][i*2+j] ); + } + } + } + barrier(); + + if ( gID >= drawCommands.length() ) return; + + const DrawCommand drawCommand = drawCommands[gID]; + if ( drawCommand.indices == 0 || drawCommand.vertices == 0 ) return; + + const Instance instance = instances[drawCommand.instanceID]; + const Object object = objects[instance.objectID]; + + vec4 sphere = aabbToSphere( instance.bounds ); + vec3 worldCenter = (object.model * vec4(sphere.xyz, 1.0)).xyz; + + float scaleX = length(object.model[0].xyz); + float scaleY = length(object.model[1].xyz); + float scaleZ = length(object.model[2].xyz); + float maxScale = max(max(scaleX, scaleY), scaleZ); + float worldRadius = sphere.w * maxScale; + + bool isVisible = false; + for ( uint pass = 0; pass < PushConstant.passes; ++pass ) { + bool insideFrustum = true; + for ( int p = 0; p < 6; ++p ) { + if ( dot(sharedPlanes[pass][p].xyz, worldCenter) + sharedPlanes[pass][p].w < -worldRadius ) { + insideFrustum = false; + break; + } + } + if ( insideFrustum ) { + isVisible = true; + break; + } + } + + if ( isVisible ) { + isVisible = false; + for ( uint pass = 0; pass < PushConstant.passes; ++pass ) { + vec4 aabb; + vec3 viewCenter = ( camera.viewport[pass].view * vec4(worldCenter, 1.0) ).xyz; + + mat4 proj = camera.viewport[pass].projection; + float znear = proj[3][2]; + float P00 = proj[0][0]; + float P11 = proj[1][1]; + + if ( projectSphere(viewCenter, worldRadius, znear, P00, P11, aabb) ) { + vec2 pyramidSize = vec2(textureSize( samplerDepth, 0 )); + float width = (aabb.z - aabb.x) * pyramidSize.x; + float height = (aabb.w - aabb.y) * pyramidSize.y; + + float level = floor(log2(max(width, height))); + level = max(0.0, level); + + float d1 = textureLod(samplerDepth, vec2(aabb.x, aabb.y), level).x; + float d2 = textureLod(samplerDepth, vec2(aabb.z, aabb.y), level).x; + float d3 = textureLod(samplerDepth, vec2(aabb.x, aabb.w), level).x; + float d4 = 
textureLod(samplerDepth, vec2(aabb.z, aabb.w), level).x; + + float depth = min(min(d1, d2), min(d3, d4)); + float depthSphere = znear / (viewCenter.z - worldRadius); + + if ( depthSphere >= depth - DEPTH_BIAS ) { + isVisible = true; + break; + } + } else { + isVisible = true; + break; + } + } + } + + drawCommands[gID].instances = isVisible ? 1 : 0; } \ No newline at end of file diff --git a/bin/data/shaders/raytrace/shader.ray-gen.glsl b/bin/data/shaders/raytrace/shader.ray-gen.glsl index b9b9285b..a2608cd7 100644 --- a/bin/data/shaders/raytrace/shader.ray-gen.glsl +++ b/bin/data/shaders/raytrace/shader.ray-gen.glsl @@ -390,7 +390,7 @@ void main() { #endif { #if BLOOM - float brightness = dot(surface.fragment.rgb, vec3(0.2126, 0.7152, 0.0722)); + float brightness = luma(surface.fragment.rgb); vec4 outFragBright = brightness > ubo.threshold ? vec4(surface.fragment.rgb, 1.0) : vec4(0, 0, 0, 1); // imageStore(outImage, ivec2(gl_LaunchIDEXT.xy), outFragBright); #endif diff --git a/engine/inc/uf/engine/graph/graph.h b/engine/inc/uf/engine/graph/graph.h index 8a6253ba..48e14849 100644 --- a/engine/inc/uf/engine/graph/graph.h +++ b/engine/inc/uf/engine/graph/graph.h @@ -111,6 +111,8 @@ namespace pod { uf::renderer::Buffer material; uf::renderer::Buffer texture; uf::renderer::Buffer light; + + uf::renderer::Texture2D depthPyramid; } buffers; }/* storage*/; }; diff --git a/engine/inc/uf/ext/vulkan/device.h b/engine/inc/uf/ext/vulkan/device.h index 0c3472a9..39663151 100644 --- a/engine/inc/uf/ext/vulkan/device.h +++ b/engine/inc/uf/ext/vulkan/device.h @@ -28,6 +28,8 @@ namespace ext { operator VkCommandBuffer() { return handle; } }; + struct Texture; + struct UF_API Device { VkInstance instance; VkDebugUtilsMessengerEXT debugMessenger; @@ -86,6 +88,7 @@ namespace ext { uf::stl::vector buffers; uf::stl::vector ass; + uf::stl::vector textures; } transient; struct { diff --git a/engine/inc/uf/ext/vulkan/graphic.h b/engine/inc/uf/ext/vulkan/graphic.h index 01f5ac15..1a7e438b 
100644 --- a/engine/inc/uf/ext/vulkan/graphic.h +++ b/engine/inc/uf/ext/vulkan/graphic.h @@ -42,8 +42,8 @@ namespace ext { void record( const Graphic& graphic, const GraphicDescriptor& descriptor, VkCommandBuffer, size_t = 0, size_t = 0, size_t = 0 ) const; void destroy(); - uf::stl::vector getShaders( uf::stl::vector& ); - uf::stl::vector getShaders( const uf::stl::vector& ) const; + uf::stl::vector getShaders( uf::stl::vector&, const uf::stl::string& = "" ); + uf::stl::vector getShaders( const uf::stl::vector&, const uf::stl::string& = "" ) const; void collectBuffers( const Shader& shader, const RenderMode& renderMode, const Graphic& graphic, const std::function& lambda ) const; }; diff --git a/engine/inc/uf/ext/vulkan/texture.h b/engine/inc/uf/ext/vulkan/texture.h index 7abc5447..76dd7a9c 100644 --- a/engine/inc/uf/ext/vulkan/texture.h +++ b/engine/inc/uf/ext/vulkan/texture.h @@ -138,7 +138,7 @@ namespace ext { inline void update( uf::Image& image, uint32_t layer = 1 ) { return this->update(image, this->imageLayout, layer); } inline void update( void* data, VkDeviceSize size, uint32_t layer = 1 ) { return this->update(data, size, this->imageLayout, layer); } - void generateMipmaps(VkCommandBuffer commandBuffer, uint32_t layer = 1); + void generateMipmaps(VkCommandBuffer commandBuffer, uint32_t layer = 0); void fromBuffers( void* buffer, VkDeviceSize bufferSize, VkFormat format, uint32_t texWidth, uint32_t texHeight, uint32_t texDepth, uint32_t layers, VkImageUsageFlags imageUsageFlags = VK_IMAGE_USAGE_SAMPLED_BIT, VkImageLayout imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL ); inline void fromBuffers( void* buffer, VkDeviceSize bufferSize, VkFormat format, uint32_t texWidth, uint32_t texHeight, VkImageUsageFlags imageUsageFlags = VK_IMAGE_USAGE_SAMPLED_BIT, VkImageLayout imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL ) { return this->fromBuffers(buffer, bufferSize, format, texWidth, texHeight, 1, 1, imageUsageFlags, imageLayout); } diff --git 
a/engine/inc/uf/ext/vulkan/vk.h b/engine/inc/uf/ext/vulkan/vk.h index 0daea033..6191b2d2 100644 --- a/engine/inc/uf/ext/vulkan/vk.h +++ b/engine/inc/uf/ext/vulkan/vk.h @@ -22,7 +22,7 @@ #define VK_DEFAULT_STAGE_BUFFERS ext::vulkan::settings::defaultStageBuffers #define VK_DEFAULT_DEFER_BUFFER_DESTROY ext::vulkan::settings::defaultDeferBufferDestroy #define VK_DEFAULT_COMMAND_BUFFER_IMMEDIATE ext::vulkan::settings::defaultCommandBufferImmediate -#define VK_UBO_USE_N_BUFFERS 1 +#define VK_UBO_USE_N_BUFFERS 0 namespace ext { namespace vulkan { diff --git a/engine/inc/uf/ext/vulkan/vulkan.h b/engine/inc/uf/ext/vulkan/vulkan.h index 1cb71fe4..7b28d4d8 100644 --- a/engine/inc/uf/ext/vulkan/vulkan.h +++ b/engine/inc/uf/ext/vulkan/vulkan.h @@ -182,10 +182,6 @@ namespace ext { extern UF_API uint32_t frameSkip; } - namespace gc { - extern UF_API uf::stl::vector textures; - } - extern UF_API Device device; extern UF_API Allocator allocator; diff --git a/engine/src/engine/ext/scene/behavior.cpp b/engine/src/engine/ext/scene/behavior.cpp index ca9ebffd..73162e20 100644 --- a/engine/src/engine/ext/scene/behavior.cpp +++ b/engine/src/engine/ext/scene/behavior.cpp @@ -938,8 +938,9 @@ void ext::ExtSceneBehavior::bindBuffers( uf::Object& self, uf::renderer::Graphic #if UF_USE_VULKAN // only update this when requested // done outside of deserialize because the rendermode might not be initialized in time - if ( uf::renderer::settings::pipelines::bloom && metadata.bloom.outOfDate && graphic.material.hasShader("compute", "bloom") ) { - auto& shader = graphic.material.getShader("compute", "bloom"); + if ( uf::renderer::settings::pipelines::bloom && metadata.bloom.outOfDate && graphic.material.hasShader("compute", "bloom-down") ) { + auto& shaderDown = graphic.material.getShader("compute", "bloom-down"); + auto& shaderUp = graphic.material.getShader("compute", "bloom-up"); struct UniformDescriptor { float threshold; @@ -974,7 +975,12 @@ void ext::ExtSceneBehavior::bindBuffers( uf::Object& 
self, uf::renderer::Graphic for ( auto i = 0; i < uniforms.size; ++i ) uniforms.weights[i] = tempWeights[i] / sum; metadata.bloom.outOfDate = false; - if ( shader.hasUniform("UBO") ) shader.updateBuffer( (const void*) &uniforms, sizeof(uniforms), shader.getUniformBuffer("UBO") ); + if ( shaderDown.hasUniform("UBO") ) { + shaderDown.updateBuffer( (const void*) &uniforms, sizeof(uniforms), shaderDown.getUniformBuffer("UBO") ); + } + if ( shaderUp.hasUniform("UBO") ) { + shaderUp.updateBuffer( (const void*) &uniforms, sizeof(uniforms), shaderUp.getUniformBuffer("UBO") ); + } } struct UniformDescriptor { diff --git a/engine/src/engine/graph/graph.cpp b/engine/src/engine/graph/graph.cpp index 8f5c8c67..6771d01f 100644 --- a/engine/src/engine/graph/graph.cpp +++ b/engine/src/engine/graph/graph.cpp @@ -196,7 +196,6 @@ namespace { // compute shader auto& shader = graphic.material.getShader("compute", uf::renderer::settings::pipelines::names::culling); - shader.aliasAttachment("depthPyramid"); } // vxgi pipeline if ( uf::renderer::settings::pipelines::vxgi ) { @@ -479,6 +478,9 @@ namespace { shader.aliasBuffer( "indirect", *indirect ); shader.aliasBuffer( "instance", storage.buffers.instance ); shader.aliasBuffer( "object", storage.buffers.object ); + + shader.textures.clear(); + shader.textures.emplace_back().aliasTexture( storage.buffers.depthPyramid ); } // vxgi pipeline @@ -1532,10 +1534,12 @@ void uf::graph::destroy( uf::Object& object, bool soft ) { void uf::graph::destroy( pod::Graph::Storage& storage, bool soft ) { soft = false; #if UF_USE_VULKAN +/* for ( auto& texture : uf::renderer::gc::textures ) { texture.destroy( false ); } uf::renderer::gc::textures.clear(); +*/ #endif // cleanup graphic handles diff --git a/engine/src/ext/vulkan/graphic.cpp b/engine/src/ext/vulkan/graphic.cpp index 8388fffb..e91821f4 100644 --- a/engine/src/ext/vulkan/graphic.cpp +++ b/engine/src/ext/vulkan/graphic.cpp @@ -41,7 +41,7 @@ void ext::vulkan::Pipeline::initialize( const Graphic& 
graphic, const GraphicDes this->metadata.type = descriptor.pipeline; Device& device = *graphic.device; - auto shaders = getShaders( graphic.material.shaders ); + auto shaders = getShaders( graphic.material.shaders, descriptor.pipeline ); assert( shaders.size() > 0 ); uint32_t subpass = descriptor.subpass; @@ -397,7 +397,10 @@ void ext::vulkan::Pipeline::record( const Graphic& graphic, VkCommandBuffer comm return record( graphic, descriptor, commandBuffer, pass, draw, offset ); } void ext::vulkan::Pipeline::record( const Graphic& graphic, const GraphicDescriptor& descriptor, VkCommandBuffer commandBuffer, size_t pass, size_t draw, size_t offset ) const { - auto shaders = getShaders( graphic.material.shaders ); + auto shaders = getShaders( graphic.material.shaders, descriptor.pipeline ); + for ( auto i = 0; i < shaders.size(); ++i ) { + // UF_MSG_DEBUG("{} | {}: {}", descriptor.pipeline, i, shaders[i]->filename); + } // create dynamic offset ranges static thread_local uf::stl::vector dynamicOffsets; @@ -427,6 +430,7 @@ void ext::vulkan::Pipeline::record( const Graphic& graphic, const GraphicDescrip else continue; } + // automatically bind to our default push constants if ( shader->metadata.definitions.pushConstants.count("PushConstant") > 0 ) { struct PushConstant { uint32_t pass; @@ -450,7 +454,10 @@ void ext::vulkan::Pipeline::record( const Graphic& graphic, const GraphicDescrip } // no matching bind point for shaders, skip - if ( !bound ) return; + if ( !bound ) { + UF_MSG_DEBUG("No shaders found to bind..."); + return; + } // Bind descriptor sets describing shader binding points #if VK_UBO_USE_N_BUFFERS @@ -506,7 +513,7 @@ void ext::vulkan::Pipeline::update( const Graphic& graphic, const GraphicDescrip RenderMode& renderMode = ext::vulkan::getRenderMode(descriptor.renderMode, true); auto& renderTarget = renderMode.getRenderTarget(/*descriptor.renderTarget*/); - auto shaders = getShaders( graphic.material.shaders ); + auto shaders = getShaders( 
graphic.material.shaders, descriptor.pipeline ); uf::stl::vector writeDescriptorSets; uf::stl::vector tlases; @@ -947,32 +954,32 @@ void ext::vulkan::Pipeline::destroy() { } */ } -uf::stl::vector ext::vulkan::Pipeline::getShaders( uf::stl::vector& shaders ) { +uf::stl::vector ext::vulkan::Pipeline::getShaders( uf::stl::vector& shaders, const uf::stl::string& type ) { uf::stl::unordered_map map; uf::stl::vector res; bool isCompute = false; for ( auto& shader : shaders ) { - if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != metadata.type ) continue; + if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != (type == "" ? metadata.type : type) ) continue; if ( shader.descriptor.stage == VK_SHADER_STAGE_COMPUTE_BIT ) isCompute = true; } for ( auto& shader : shaders ) { - if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != metadata.type ) continue; + if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != (type == "" ? metadata.type : type) ) continue; if ( isCompute && shader.descriptor.stage != VK_SHADER_STAGE_COMPUTE_BIT ) continue; map[shader.metadata.type] = &shader; } for ( auto pair : map ) res.insert( res.begin(), pair.second); return res; } -uf::stl::vector ext::vulkan::Pipeline::getShaders( const uf::stl::vector& shaders ) const { +uf::stl::vector ext::vulkan::Pipeline::getShaders( const uf::stl::vector& shaders, const uf::stl::string& type ) const { uf::stl::unordered_map map; uf::stl::vector res; bool isCompute = false; for ( auto& shader : shaders ) { - if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != metadata.type ) continue; + if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != (type == "" ? 
metadata.type : type) ) continue; if ( shader.descriptor.stage == VK_SHADER_STAGE_COMPUTE_BIT ) isCompute = true; } for ( auto& shader : shaders ) { - if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != metadata.type ) continue; + if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != (type == "" ? metadata.type : type) ) continue; if ( isCompute && shader.descriptor.stage != VK_SHADER_STAGE_COMPUTE_BIT ) continue; map[shader.metadata.type] = &shader; } @@ -1839,19 +1846,19 @@ void ext::vulkan::Graphic::record( VkCommandBuffer commandBuffer, size_t pass, s void ext::vulkan::Graphic::record( VkCommandBuffer commandBuffer, const GraphicDescriptor& descriptor, size_t pass, size_t draw, size_t offset ) const { if ( !process ) return; if ( !this->hasPipeline( descriptor ) ) { - VK_DEBUG_VALIDATION_MESSAGE(this << ": has no valid pipeline ({} {})", descriptor.renderMode, descriptor.renderTarget); + //UF_MSG_DEBUG("{} has no valid pipeline ({}:{}:{})", (void*) this, descriptor.renderMode, descriptor.renderTarget, descriptor.pipeline); return; } auto& pipeline = this->getPipeline( descriptor ); if ( pipeline.descriptorSet == VK_NULL_HANDLE ) { - VK_DEBUG_VALIDATION_MESSAGE(this << ": has no valid pipeline descriptor set ({} {})", descriptor.renderMode, descriptor.renderTarget); + //UF_MSG_DEBUG("{} has no valid pipeline descriptor set ({}:{}:{})", (void*) this, descriptor.renderMode, descriptor.renderTarget, descriptor.pipeline); return; } if ( !pipeline.metadata.process ) return; pipeline.record(*this, descriptor, commandBuffer, pass, draw, offset); - auto shaders = pipeline.getShaders( material.shaders ); + auto shaders = pipeline.getShaders( material.shaders, descriptor.pipeline ); for ( auto* shader : shaders ) { if ( shader->descriptor.stage == VK_SHADER_STAGE_COMPUTE_BIT ) return; if ( diff --git a/engine/src/ext/vulkan/rendermodes/base.cpp b/engine/src/ext/vulkan/rendermodes/base.cpp index 6b992d49..7c3a6726 100644 --- 
a/engine/src/ext/vulkan/rendermodes/base.cpp +++ b/engine/src/ext/vulkan/rendermodes/base.cpp @@ -273,8 +273,8 @@ void ext::vulkan::BaseRenderMode::initialize( Device& device ) { // swapchain.destroy(); swapchain.initialize( device ); // bind swapchain images - images.resize( ext::vulkan::swapchain.buffers ); - VK_CHECK_RESULT(vkGetSwapchainImagesKHR( device, swapchain.swapChain, &swapchain.buffers, images.data())); + ::images.resize( ext::vulkan::swapchain.buffers ); + VK_CHECK_RESULT(vkGetSwapchainImagesKHR( device, swapchain.swapChain, &swapchain.buffers, ::images.data())); // create image views for swapchain images renderTarget.attachments.clear(); @@ -302,7 +302,7 @@ void ext::vulkan::BaseRenderMode::initialize( Device& device ) { colorAttachmentView.subresourceRange.layerCount = 1; colorAttachmentView.viewType = VK_IMAGE_VIEW_TYPE_2D; colorAttachmentView.flags = 0; - colorAttachmentView.image = images[frame]; + colorAttachmentView.image = ::images[frame]; VK_CHECK_RESULT(vkCreateImageView( device, &colorAttachmentView, nullptr, &renderTarget.attachments[frame].view)); VK_REGISTER_HANDLE( renderTarget.attachments[frame].view ); @@ -312,7 +312,7 @@ void ext::vulkan::BaseRenderMode::initialize( Device& device ) { renderTarget.attachments[frame].descriptor.layout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; renderTarget.attachments[frame].descriptor.usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; renderTarget.attachments[frame].descriptor.aliased = true; - renderTarget.attachments[frame].image = images[frame]; + renderTarget.attachments[frame].image = ::images[frame]; renderTarget.attachments[frame].mem = VK_NULL_HANDLE; metadata.attachments["color["+std::to_string((int) frame)+"]"] = attachmentIndex++; @@ -530,7 +530,7 @@ void ext::vulkan::BaseRenderMode::initialize( Device& device ) { // Create framebuffer { // Create a frame buffer for every image in the swapchain - renderTarget.framebuffers.resize(images.size()); + renderTarget.framebuffers.resize(::images.size()); for 
(size_t frame = 0; frame < renderTarget.framebuffers.size(); frame++) { std::array attachments; @@ -555,7 +555,7 @@ void ext::vulkan::BaseRenderMode::initialize( Device& device ) { #if 0 if ( true ) { auto commandBuffer = device.fetchCommandBuffer(uf::renderer::QueueEnum::TRANSFER); - for ( size_t frame = 0; frame < images.size(); ++frame ) { + for ( size_t frame = 0; frame < ::images.size(); ++frame ) { VkImageMemoryBarrier imageMemoryBarrier = {}; imageMemoryBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; imageMemoryBarrier.srcAccessMask = 0; @@ -652,10 +652,11 @@ void ext::vulkan::BaseRenderMode::destroy() { } for ( auto& image : ::images ) { - // vkDestroyImage( *device, image, nullptr ); + // vkDestroyImage( *device, image, nullptr ); // destroyed via vkDestroySwapchainKHR VK_UNREGISTER_HANDLE( image ); image = VK_NULL_HANDLE; } + ::images.clear(); ext::vulkan::RenderMode::destroy(); diff --git a/engine/src/ext/vulkan/rendermodes/deferred.cpp b/engine/src/ext/vulkan/rendermodes/deferred.cpp index d77a78d0..d4a6528f 100644 --- a/engine/src/ext/vulkan/rendermodes/deferred.cpp +++ b/engine/src/ext/vulkan/rendermodes/deferred.cpp @@ -26,22 +26,34 @@ namespace { const uf::stl::string DEFERRED_MODE = "compute"; - ext::vulkan::Texture depthPyramid; + uf::stl::vector depthPyramidViews; - - void cmdImageBarrier(VkCommandBuffer commandBuffer, VkImage image, VkAccessFlags srcAccess, VkAccessFlags dstAccess, VkImageLayout oldLayout, VkImageLayout newLayout) { - VkImageMemoryBarrier barrier{VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER}; - barrier.srcAccessMask = srcAccess; - barrier.dstAccessMask = dstAccess; - barrier.oldLayout = oldLayout; - barrier.newLayout = newLayout; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = image; - barrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }; - - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, NULL, 0, NULL, 1, &barrier); + uf::stl::vector bloomViews; + + ext::vulkan::Buffer atomicCounterBloom; + ext::vulkan::Buffer atomicCounterDepth; + + struct AtomicCounter { + uint32_t counter; }; + struct PushConstants { + uint32_t mips; + uint32_t numWorkGroups; + uint32_t workGroupOffset; + }; + + void destroyImageView( ext::vulkan::RenderMode* self, VkImageView view ) { + ext::vulkan::mutex.lock(); + auto& texture = self->device->transient.textures.emplace_back(); + ext::vulkan::mutex.unlock(); + + texture.device = self->device; + texture.view = view; + /* + vkDestroyImageView(self->device.logicalDevice, view, nullptr); + VK_UNREGISTER_HANDLE(view); + */ + } } #include "./transition.inl" @@ -64,8 +76,7 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { struct { size_t id, bary, depth, uv, normal; - size_t color, bright, motion, scratch, output; - size_t depthPyramid; + size_t color, bright, motion, output; } attachments = {}; bool blend = true; // !ext::vulkan::settings::invariant::deferredSampling; @@ -108,7 +119,7 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { attachments.depth = renderTarget.attach(RenderTarget::Attachment::Descriptor{ /*.format = */ext::vulkan::settings::formats::depth, /*.layout = */VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, - /*.usage = */VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT, + /*.usage = */VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT, /*.blend = */false, /*.samples = */msaa, //*.mips = */1, @@ -127,13 +138,7 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { /*.usage =*/ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | 
VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, /*.blend =*/ blend, /*.samples =*/ 1, - }); - attachments.scratch = renderTarget.attach(RenderTarget::Attachment::Descriptor{ - /*.format =*/ ext::vulkan::settings::pipelines::hdr ? enums::Format::HDR : enums::Format::SDR, - /*.layout = */ VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, - /*.usage =*/ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, - /*.blend =*/ blend, - /*.samples =*/ 1, + /*.mips =*/ mips, }); attachments.motion = renderTarget.attach(RenderTarget::Attachment::Descriptor{ // /*.format = */VK_FORMAT_R32G32B32A32_SFLOAT, @@ -143,14 +148,6 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { /*.blend = */false, /*.samples = */1, }); - attachments.depthPyramid = renderTarget.attach(RenderTarget::Attachment::Descriptor{ - /*.format = */VK_FORMAT_R32_SFLOAT, - /*.layout = */ VK_IMAGE_LAYOUT_GENERAL, - /*.usage = */ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, - /*.blend = */false, - /*.samples = */1, - /*.mips = */mips, - }); metadata.attachments["id"] = attachments.id; @@ -164,10 +161,8 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { #endif metadata.attachments["depth"] = attachments.depth; - metadata.attachments["depthPyramid"] = attachments.depthPyramid; metadata.attachments["color"] = attachments.color; metadata.attachments["bright"] = attachments.bright; - metadata.attachments["scratch"] = attachments.scratch; metadata.attachments["motion"] = attachments.motion; metadata.attachments["output"] = attachments.color; @@ -339,13 +334,67 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { } if ( settings::pipelines::bloom ) { - uf::stl::string computeShaderFilename = uf::io::resolveURI(uf::io::root+"/shaders/display/bloom/comp.spv"); - blitter.material.attachShader(computeShaderFilename, uf::renderer::enums::Shader::COMPUTE, "bloom"); + 
uf::stl::string computeShaderFilename = uf::io::resolveURI(uf::io::root+"/shaders/display/bloom/up.comp.spv"); + blitter.material.attachShader(computeShaderFilename, uf::renderer::enums::Shader::COMPUTE, "bloom-up"); + + auto& shader = blitter.material.getShader("compute", "bloom-up"); - auto& shader = blitter.material.getShader("compute", "bloom"); shader.aliasAttachment("color", this, VK_IMAGE_LAYOUT_GENERAL); shader.aliasAttachment("bright", this, VK_IMAGE_LAYOUT_GENERAL); - shader.aliasAttachment("scratch", this, VK_IMAGE_LAYOUT_GENERAL); + } + + if ( settings::pipelines::bloom ) { + uf::stl::string computeShaderFilename = uf::io::resolveURI(uf::io::root+"/shaders/display/bloom/down.comp.spv"); + blitter.material.attachShader(computeShaderFilename, uf::renderer::enums::Shader::COMPUTE, "bloom-down"); + + auto& shader = blitter.material.getShader("compute", "bloom-down"); + auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); + + shader.aliasAttachment("color", this, VK_IMAGE_LAYOUT_GENERAL); + shader.aliasAttachment("bright", this, VK_IMAGE_LAYOUT_GENERAL); + + shader.setSpecializationConstants({ + { "MIPS", mips }, + }); + shader.setDescriptorCounts({ + { "outImage", mips }, + }); + + // atomic counter buffer + ::atomicCounterBloom.initialize( (const void*) nullptr, sizeof(::AtomicCounter) * 1, uf::renderer::enums::Buffer::STORAGE ); + shader.aliasBuffer("atomicCounterBloom", ::atomicCounterBloom); + + for ( auto& view : ::bloomViews ) ::destroyImageView( this, view ); + ::bloomViews.clear(); + ::bloomViews.resize(mips); + shader.textures.clear(); + + ext::vulkan::Texture2D source; source.aliasAttachment( this->getAttachment("bright") ); + for ( auto i = 0; i < mips; ++i ) { + auto& view = ::bloomViews[i]; + VkImageViewCreateInfo viewCreateInfo = {}; + viewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + viewCreateInfo.pNext = NULL; + viewCreateInfo.components = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, 
VK_COMPONENT_SWIZZLE_A }; + viewCreateInfo.subresourceRange.baseMipLevel = i; + viewCreateInfo.subresourceRange.layerCount = 1; + viewCreateInfo.subresourceRange.levelCount = 1; + viewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + viewCreateInfo.viewType = source.viewType; + viewCreateInfo.format = source.format; + viewCreateInfo.image = source.image; + + VK_CHECK_RESULT(vkCreateImageView(device.logicalDevice, &viewCreateInfo, nullptr, &view)); + VK_REGISTER_HANDLE(view); + + { + auto& texture = shader.textures.emplace_back(); + texture.aliasTexture( source ); + texture.view = view; + texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + texture.updateDescriptors(); + } + } } if ( settings::pipelines::culling ) { @@ -354,27 +403,31 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { auto& shader = blitter.material.getShader("compute", "depth-pyramid"); auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); + // depth pyramid + shader.aliasAttachment("depth", this); + shader.setSpecializationConstants({ { "MIPS", mips }, }); shader.setDescriptorCounts({ - { "inImage", mips }, { "outImage", mips }, }); - shader.aliasAttachment("depth", this); + // atomic counter buffer + ::atomicCounterDepth.initialize( (const void*) nullptr, sizeof(::AtomicCounter) * 1, uf::renderer::enums::Buffer::STORAGE ); + shader.aliasBuffer("atomicCounterDepth", ::atomicCounterDepth); - ext::vulkan::Texture2D source; source.aliasAttachment( this->getAttachment("depthPyramid") ); - source.sampler.descriptor.reduction.enabled = true; - source.sampler.descriptor.reduction.mode = VK_SAMPLER_REDUCTION_MODE_MIN; - - for ( auto& view : ::depthPyramidViews ) { - vkDestroyImageView(device.logicalDevice, view, nullptr); - VK_UNREGISTER_HANDLE(view); - } + for ( auto& view : ::depthPyramidViews ) ::destroyImageView( this, view ); ::depthPyramidViews.clear(); ::depthPyramidViews.resize(mips); shader.textures.clear(); + + 
storage.buffers.depthPyramid.destroy(true); + storage.buffers.depthPyramid.fromBuffers( NULL, 0, VK_FORMAT_R32_SFLOAT, width, height, 1, 1, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT, VK_IMAGE_LAYOUT_GENERAL ); + + ext::vulkan::Texture2D& source = storage.buffers.depthPyramid; + source.sampler.descriptor.reduction.enabled = true; + source.sampler.descriptor.reduction.mode = VK_SAMPLER_REDUCTION_MODE_MIN; for ( auto i = 0; i < mips; ++i ) { auto& view = ::depthPyramidViews[i]; @@ -392,22 +445,14 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { VK_CHECK_RESULT(vkCreateImageView(device.logicalDevice, &viewCreateInfo, nullptr, &view)); VK_REGISTER_HANDLE(view); - } - for ( auto i = 0; i < mips; ++i ) { - auto& texture = shader.textures.emplace_back(); - texture.aliasTexture( source ); - texture.view = ::depthPyramidViews[i]; - texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; - texture.updateDescriptors(); - } - - for ( auto i = 0; i < mips; ++i ) { - auto& texture = shader.textures.emplace_back(); - texture.aliasTexture( source ); - texture.view = ::depthPyramidViews[i]; - texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; - texture.updateDescriptors(); + { + auto& texture = shader.textures.emplace_back(); + texture.aliasTexture( source ); + texture.view = view; + texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + texture.updateDescriptors(); + } } } @@ -437,7 +482,18 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { } if ( settings::pipelines::bloom ) { - descriptor.pipeline = "bloom"; + descriptor.aux = uf::vector::mips( pod::Vector2ui{ width, height } ); + descriptor.pipeline = "bloom-down"; + descriptor.subpass = 0; + descriptor.bind.point = VK_PIPELINE_BIND_POINT_COMPUTE; + if ( !blitter.hasPipeline( descriptor ) ) { + blitter.initializePipeline( descriptor ); + } + } + + if ( settings::pipelines::bloom ) { + descriptor.aux = {}; + descriptor.pipeline = "bloom-up"; descriptor.subpass = 0; descriptor.bind.point = 
VK_PIPELINE_BIND_POINT_COMPUTE; if ( !blitter.hasPipeline( descriptor ) ) { @@ -474,6 +530,49 @@ void ext::vulkan::DeferredRenderMode::tick() { rebuild = true; renderTarget.initialize( *renderTarget.device ); + if ( settings::pipelines::bloom ) { + auto& shader = blitter.material.getShader("compute", "bloom-down"); + auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); + shader.setSpecializationConstants({ + { "MIPS", mips }, + }); + shader.setDescriptorCounts({ + { "outImage", mips }, + }); + + for ( auto& view : ::bloomViews ) ::destroyImageView( this, view ); + ::bloomViews.clear(); + ::bloomViews.resize(mips); + shader.textures.clear(); + + ext::vulkan::Texture2D source; source.aliasAttachment( this->getAttachment("bright") ); + for ( auto i = 0; i < mips; ++i ) { + auto& view = ::bloomViews[i]; + VkImageViewCreateInfo viewCreateInfo = {}; + viewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + viewCreateInfo.pNext = NULL; + viewCreateInfo.components = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A }; + viewCreateInfo.subresourceRange.baseMipLevel = i; + viewCreateInfo.subresourceRange.layerCount = 1; + viewCreateInfo.subresourceRange.levelCount = 1; + viewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + viewCreateInfo.viewType = source.viewType; + viewCreateInfo.format = source.format; + viewCreateInfo.image = source.image; + + VK_CHECK_RESULT(vkCreateImageView(device->logicalDevice, &viewCreateInfo, nullptr, &view)); + VK_REGISTER_HANDLE(view); + + { + auto& texture = shader.textures.emplace_back(); + texture.aliasTexture( source ); + texture.view = view; + texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + texture.updateDescriptors(); + } + } + } + if ( settings::pipelines::culling ) { auto& shader = blitter.material.getShader("compute", "depth-pyramid"); auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); @@ -481,20 +580,17 @@ void 
ext::vulkan::DeferredRenderMode::tick() { { "MIPS", mips }, }); shader.setDescriptorCounts({ - { "inImage", mips }, { "outImage", mips }, }); - shader.aliasAttachment("depth", this); + storage.buffers.depthPyramid.destroy(true); + storage.buffers.depthPyramid.fromBuffers( NULL, 0, VK_FORMAT_R32_SFLOAT, width, height, 1, 1, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT, VK_IMAGE_LAYOUT_GENERAL ); - ext::vulkan::Texture2D source; source.aliasAttachment( this->getAttachment("depthPyramid") ); + ext::vulkan::Texture2D& source = storage.buffers.depthPyramid; source.sampler.descriptor.reduction.enabled = true; source.sampler.descriptor.reduction.mode = VK_SAMPLER_REDUCTION_MODE_MIN; - for ( auto& view : ::depthPyramidViews ) { - vkDestroyImageView(device->logicalDevice, view, nullptr); - VK_UNREGISTER_HANDLE(view); - } + for ( auto& view : ::depthPyramidViews ) ::destroyImageView( this, view ); ::depthPyramidViews.clear(); ::depthPyramidViews.resize(mips); shader.textures.clear(); @@ -515,23 +611,14 @@ void ext::vulkan::DeferredRenderMode::tick() { VK_CHECK_RESULT(vkCreateImageView(device->logicalDevice, &viewCreateInfo, nullptr, &view)); VK_REGISTER_HANDLE(view); - - } - for ( auto i = 0; i < mips; ++i ) { - auto& texture = shader.textures.emplace_back(); - texture.aliasTexture( source ); - texture.view = ::depthPyramidViews[i]; - texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; - texture.updateDescriptors(); - } - - for ( auto i = 0; i < mips; ++i ) { - auto& texture = shader.textures.emplace_back(); - texture.aliasTexture( source ); - texture.view = ::depthPyramidViews[i]; - texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; - texture.updateDescriptors(); + { + auto& texture = shader.textures.emplace_back(); + texture.aliasTexture( source ); + texture.view = view; + texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + texture.updateDescriptors(); + } } } } @@ -582,7 +669,20 @@ void ext::vulkan::DeferredRenderMode::tick() { } if ( settings::pipelines::bloom ) { - 
descriptor.pipeline = "bloom"; + descriptor.aux = uf::vector::mips( pod::Vector2ui{ width, height } ); + descriptor.pipeline = "bloom-down"; + descriptor.subpass = 0; + descriptor.bind.point = VK_PIPELINE_BIND_POINT_COMPUTE; + if ( blitter.hasPipeline( descriptor ) ) { + blitter.getPipeline( descriptor ).update( blitter, descriptor ); + } else { + blitter.initializePipeline( descriptor ); + } + } + + if ( settings::pipelines::bloom ) { + descriptor.aux = {}; + descriptor.pipeline = "bloom-up"; descriptor.subpass = 0; descriptor.bind.point = VK_PIPELINE_BIND_POINT_COMPUTE; if ( blitter.hasPipeline( descriptor ) ) { @@ -659,6 +759,21 @@ void ext::vulkan::DeferredRenderMode::render() { //unlockMutex( this->mostRecentCommandPoolId ); } void ext::vulkan::DeferredRenderMode::destroy() { + // cleanup + ::atomicCounterDepth.destroy(false); + ::atomicCounterBloom.destroy(false); + + for ( auto& view : ::bloomViews ) { + vkDestroyImageView(device->logicalDevice, view, nullptr); + VK_UNREGISTER_HANDLE(view); + } + ::bloomViews.clear(); + for ( auto& view : ::depthPyramidViews ) { + vkDestroyImageView(device->logicalDevice, view, nullptr); + VK_UNREGISTER_HANDLE(view); + } + ::depthPyramidViews.clear(); + ext::vulkan::RenderMode::destroy(); } @@ -744,18 +859,6 @@ void ext::vulkan::DeferredRenderMode::createCommandBuffers( const uf::stl::vecto size_t currentSubpass = 0; - /* - // transition layers for read - for ( auto layer : layers ) { - layer->pipelineBarrier( commandBuffer, 0 ); - } - */ - // VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL - // VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL - - // VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL - // VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL - #if 1 for ( auto& attachment : renderTarget.attachments ) { // transition attachments to general attachments for imageStore @@ -863,11 +966,61 @@ void ext::vulkan::DeferredRenderMode::createCommandBuffers( const uf::stl::vecto ::transitionAttachmentsFrom( this, shader, commandBuffer ); } - 
if ( settings::pipelines::bloom && blitter.material.hasShader("compute", "bloom") ) { - auto& shader = blitter.material.getShader("compute", "bloom"); + if ( settings::pipelines::bloom && blitter.material.hasShader("compute", "bloom-down") ) { + auto& shader = blitter.material.getShader("compute", "bloom-down"); + auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); + + uint32_t dispatchX = (width + 63) / 64; + uint32_t dispatchY = (height + 63) / 64; + uint32_t numWorkGroups = dispatchX * dispatchY; + auto& pushConstant = shader.pushConstants.front().get<::PushConstants>(); + pushConstant = { + .mips = mips, + .numWorkGroups = numWorkGroups, + .workGroupOffset = 0, + }; + ext::vulkan::GraphicDescriptor descriptor = blitter.descriptor; descriptor.renderMode = ""; - descriptor.pipeline = "bloom"; + descriptor.aux = mips; + descriptor.pipeline = "bloom-down"; + descriptor.bind.width = dispatchX * 256; + descriptor.bind.height = dispatchY; + descriptor.bind.depth = metadata.eyes; + descriptor.bind.point = VK_PIPELINE_BIND_POINT_COMPUTE; + descriptor.subpass = 0; + + // reset counter buffer + vkCmdFillBuffer(commandBuffer, ::atomicCounterBloom.buffer, 0, 4, 0); + VkMemoryBarrier counterBarrier{VK_STRUCTURE_TYPE_MEMORY_BARRIER}; + counterBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + counterBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &counterBarrier, 0, nullptr, 0, nullptr); + + // transition attachments to general attachments for imageStore + device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "setImageLayout" ); + ::transitionAttachmentsTo( this, shader, commandBuffer ); + + // dispatch compute shader + device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "bloom[down]" ); + blitter.record( commandBuffer, descriptor ); + + /* + ext::vulkan::Texture2D source; + 
source.aliasAttachment( this->getAttachment("bright") ); + source.generateMipmaps( commandBuffer ); + */ + + // transition attachments back to shader read layouts + device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "setImageLayout" ); + ::transitionAttachmentsFrom( this, shader, commandBuffer ); + } + + if ( settings::pipelines::bloom && blitter.material.hasShader("compute", "bloom-up") ) { + auto& shader = blitter.material.getShader("compute", "bloom-up"); + ext::vulkan::GraphicDescriptor descriptor = blitter.descriptor; + descriptor.renderMode = ""; + descriptor.pipeline = "bloom-up"; descriptor.bind.width = width; descriptor.bind.height = height; descriptor.bind.depth = metadata.eyes; @@ -879,20 +1032,8 @@ void ext::vulkan::DeferredRenderMode::createCommandBuffers( const uf::stl::vecto ::transitionAttachmentsTo( this, shader, commandBuffer ); // dispatch compute shader - auto& attachmentColor = this->getAttachment("color"); // color - auto& attachmentBright = this->getAttachment("bright"); // bloom - auto& attachmentScratch = this->getAttachment("scratch"); // pingpong - - device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "bloom[1]" ); - blitter.record( commandBuffer, descriptor, 0, 1 ); - cmdImageBarrier( commandBuffer, attachmentScratch.image, VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_GENERAL ); - - device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "bloom[2]" ); - blitter.record( commandBuffer, descriptor, 0, 2 ); - cmdImageBarrier( commandBuffer, attachmentBright.image, VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_GENERAL ); - - device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "bloom[3]" ); - blitter.record( commandBuffer, descriptor, 0, 3 ); + device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "bloom[up]" ); + blitter.record( commandBuffer, descriptor ); // 
transition attachments back to shader read layouts device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "setImageLayout" ); @@ -900,80 +1041,51 @@ void ext::vulkan::DeferredRenderMode::createCommandBuffers( const uf::stl::vecto } // construct depth-pyramid - #if 1 if ( settings::pipelines::culling && blitter.material.hasShader("compute", "depth-pyramid") ) { auto& shader = blitter.material.getShader("compute", "depth-pyramid"); auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); + uint32_t dispatchX = (width + 63) / 64; + uint32_t dispatchY = (height + 63) / 64; + uint32_t numWorkGroups = dispatchX * dispatchY; + auto& pushConstant = shader.pushConstants.front().get<::PushConstants>(); + pushConstant = { + .mips = mips, + .numWorkGroups = numWorkGroups, + .workGroupOffset = 0, + }; + ext::vulkan::GraphicDescriptor descriptor = blitter.descriptor; descriptor.renderMode = ""; descriptor.aux = mips; descriptor.pipeline = "depth-pyramid"; + descriptor.bind.width = dispatchX * 256; + descriptor.bind.height = dispatchY; descriptor.bind.depth = metadata.eyes; descriptor.bind.point = VK_PIPELINE_BIND_POINT_COMPUTE; descriptor.subpass = 0; - // dispatch compute shader - VkMemoryBarrier memoryBarrier{VK_STRUCTURE_TYPE_MEMORY_BARRIER}; - memoryBarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - memoryBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + // reset counter buffer + vkCmdFillBuffer(commandBuffer, ::atomicCounterDepth.buffer, 0, 4, 0); + VkMemoryBarrier counterBarrier{VK_STRUCTURE_TYPE_MEMORY_BARRIER}; + counterBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + counterBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &counterBarrier, 0, nullptr, 0, nullptr); device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "setImageLayout" ); ::transitionAttachmentsTo( this, shader, 
commandBuffer ); - for ( auto i = 0; i < mips; ++i ) { - // for some reason it dispatches at half the width without offsetting back... - descriptor.bind.width = std::max(1u, width >> (i - 1)); - descriptor.bind.height = std::max(1u, height >> (i - 1)); - - blitter.record(commandBuffer, descriptor, 0, i); - - vkCmdPipelineBarrier( commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_FLAGS_NONE, 1, &memoryBarrier, 0, NULL, 0, NULL ); - } + blitter.record(commandBuffer, descriptor); device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "setImageLayout" ); ::transitionAttachmentsFrom( this, shader, commandBuffer ); } - #endif + // post-renderpass commands VK_COMMAND_BUFFER_CALLBACK( CALLBACK_END, commandBuffer, frame, { device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "callback[end]" ); } ); - - #if 0 - if ( this->hasAttachment("depth") ) { - auto& attachment = this->getAttachment("depth"); - ext::vulkan::Texture texture; texture.aliasAttachment( attachment ); - texture.width = width; - texture.height = height; - texture.depth = 1; - - texture.imageLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL; - texture.descriptor.imageLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL; - #if 1 - imageMemoryBarrier.subresourceRange.layerCount = metadata.eyes; - imageMemoryBarrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; - uf::renderer::Texture::setImageLayout( commandBuffer, attachment.image, VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, imageMemoryBarrier.subresourceRange ); - #endif - - for ( size_t eye = 0; eye < metadata.eyes; ++eye ) { - texture.generateMipmaps(commandBuffer, eye); - } - - #if 1 - uf::renderer::Texture::setImageLayout( commandBuffer, attachment.image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL, imageMemoryBarrier.subresourceRange ); - 
imageMemoryBarrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - imageMemoryBarrier.subresourceRange.layerCount = 1; - #endif - } #endif - #endif - - /* - for ( auto layer : layers ) { - layer->pipelineBarrier( commandBuffer, 1 ); - } - */ } device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::END, "end" ); diff --git a/engine/src/ext/vulkan/rendermodes/transition.inl b/engine/src/ext/vulkan/rendermodes/transition.inl index 3cc15643..90d8feae 100644 --- a/engine/src/ext/vulkan/rendermodes/transition.inl +++ b/engine/src/ext/vulkan/rendermodes/transition.inl @@ -9,24 +9,41 @@ namespace { subresourceRange.baseMipLevel = 0; subresourceRange.levelCount = 1; subresourceRange.baseArrayLayer = 0; - subresourceRange.layerCount = 1; - subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; subresourceRange.layerCount = self->metadata.eyes; + subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; for ( auto& descriptor : shader.metadata.aliases.attachments ) { if ( descriptor.layout == VK_IMAGE_LAYOUT_UNDEFINED ) continue; VkImage image = VK_NULL_HANDLE; + VkImageLayout initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + size_t mips = 1; + if ( descriptor.renderMode ) { - if ( descriptor.renderMode->hasAttachment(descriptor.name) ) - image = descriptor.renderMode->getAttachment(descriptor.name).image; + if ( descriptor.renderMode->hasAttachment(descriptor.name) ) { + auto& attachment = descriptor.renderMode->getAttachment(descriptor.name); + image = attachment.image; + mips = attachment.descriptor.mips; + initialLayout = attachment.descriptor.layout; + } } else if ( self->hasAttachment(descriptor.name) ) { - if ( self->hasAttachment(descriptor.name) ) - image = self->getAttachment(descriptor.name).image; + if ( self->hasAttachment(descriptor.name) ) { + auto& attachment = self->getAttachment(descriptor.name); + image = attachment.image; + mips = attachment.descriptor.mips; + initialLayout = attachment.descriptor.layout; + } } if ( image == VK_NULL_HANDLE ) 
continue; + subresourceRange.baseMipLevel = 0; + subresourceRange.levelCount = 1; subresourceRange.aspectMask = descriptor.name == "depth" ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT; uf::renderer::Texture::setImageLayout( commandBuffer, image, layout, descriptor.layout, subresourceRange ); + if ( mips > 1 ) { + subresourceRange.baseMipLevel = 1; + subresourceRange.levelCount = VK_REMAINING_MIP_LEVELS; + uf::renderer::Texture::setImageLayout( commandBuffer, image, initialLayout, descriptor.layout, subresourceRange ); + } } } void transitionAttachmentsFrom( @@ -39,24 +56,41 @@ namespace { subresourceRange.baseMipLevel = 0; subresourceRange.levelCount = 1; subresourceRange.baseArrayLayer = 0; - subresourceRange.layerCount = 1; - subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; subresourceRange.layerCount = self->metadata.eyes; + subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; for ( auto& descriptor : shader.metadata.aliases.attachments ) { if ( descriptor.layout == VK_IMAGE_LAYOUT_UNDEFINED ) continue; VkImage image = VK_NULL_HANDLE; + VkImageLayout initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + size_t mips = 1; + if ( descriptor.renderMode ) { - if ( descriptor.renderMode->hasAttachment(descriptor.name) ) - image = descriptor.renderMode->getAttachment(descriptor.name).image; + if ( descriptor.renderMode->hasAttachment(descriptor.name) ) { + auto& attachment = descriptor.renderMode->getAttachment(descriptor.name); + image = attachment.image; + mips = attachment.descriptor.mips; + initialLayout = attachment.descriptor.layout; + } } else if ( self->hasAttachment(descriptor.name) ) { - if ( self->hasAttachment(descriptor.name) ) - image = self->getAttachment(descriptor.name).image; + if ( self->hasAttachment(descriptor.name) ) { + auto& attachment = self->getAttachment(descriptor.name); + image = attachment.image; + mips = attachment.descriptor.mips; + initialLayout = attachment.descriptor.layout; + } } if ( image == VK_NULL_HANDLE ) continue; 
+ subresourceRange.baseMipLevel = 0; + subresourceRange.levelCount = 1; subresourceRange.aspectMask = descriptor.name == "depth" ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT; uf::renderer::Texture::setImageLayout( commandBuffer, image, descriptor.layout, layout, subresourceRange ); + if ( mips > 1 ) { + subresourceRange.baseMipLevel = 1; + subresourceRange.levelCount = VK_REMAINING_MIP_LEVELS; + uf::renderer::Texture::setImageLayout( commandBuffer, image, descriptor.layout, initialLayout, subresourceRange ); + } } } } \ No newline at end of file diff --git a/engine/src/ext/vulkan/shader.cpp b/engine/src/ext/vulkan/shader.cpp index fedc8af2..fc42393a 100644 --- a/engine/src/ext/vulkan/shader.cpp +++ b/engine/src/ext/vulkan/shader.cpp @@ -12,8 +12,8 @@ #include #include -#define VK_DEBUG_VALIDATION_MESSAGE(x)\ -// VK_VALIDATION_MESSAGE(x); +#define VK_DEBUG_VALIDATION_MESSAGE(...)\ + //VK_VALIDATION_MESSAGE(__VA_ARGS__); #define UF_SHADER_PARSE_AS_JSON 0 #if UF_SHADER_PARSE_AS_JSON @@ -94,7 +94,7 @@ ext::vulkan::userdata_t ext::vulkan::jsonToUserdata( const ext::json::Value& pay #if UF_SHADER_TRACK_NAMES uf::stl::string path = uf::string::join(variableName, "."); path = uf::string::replace( path, ".[", "[" ); - VK_VALIDATION_MESSAGE("[" << (byteBuffer - byteBufferStart) << " / "<< (byteBufferEnd - byteBuffer) <<"]\tInserting: " << path << " = " << value.dump()); + //VK_VALIDATION_MESSAGE("[" << (byteBuffer - byteBufferStart) << " / "<< (byteBufferEnd - byteBuffer) <<"]\tInserting: " << path << " = " << value.dump()); #endif // is strictly an int if ( value.is(true) ) { @@ -120,7 +120,7 @@ ext::vulkan::userdata_t ext::vulkan::jsonToUserdata( const ext::json::Value& pay #endif }; #if UF_SHADER_TRACK_NAMES - VK_VALIDATION_MESSAGE("Updating {} in {}", name, filename); + //VK_VALIDATION_MESSAGE("Updating {} in {}", name, filename); // VK_VALIDATION_MESSAGE("Iterator: " << (void*) byteBuffer << "\t" << (void*) byteBufferEnd << "\t" << (byteBufferEnd - 
byteBuffer)); #endif parse(payload); @@ -264,7 +264,7 @@ ext::vulkan::userdata_t ext::vulkan::jsonToUserdata( const ext::json::Value& pay #if UF_SHADER_TRACK_NAMES uf::stl::string path = uf::string::join(variableName, "."); path = uf::string::replace( path, ".[", "[" ); - VK_VALIDATION_MESSAGE("[" << (byteBuffer - byteBufferStart) << " / "<< (byteBufferEnd - byteBuffer) <<"]\tInserting: " << path << " = (" << primitive << ") " << input.dump()); + //VK_VALIDATION_MESSAGE("[" << (byteBuffer - byteBufferStart) << " / "<< (byteBufferEnd - byteBuffer) <<"]\tInserting: " << path << " = (" << primitive << ") " << input.dump()); #endif pushValue( primitive, input ); } @@ -275,12 +275,12 @@ ext::vulkan::userdata_t ext::vulkan::jsonToUserdata( const ext::json::Value& pay }; auto& definitions = metadata.json["definitions"]["uniforms"][name]; #if UF_SHADER_TRACK_NAMES - VK_VALIDATION_MESSAGE("Updating " << name << " in " << filename); - VK_VALIDATION_MESSAGE("Iterator: " << (void*) byteBuffer << "\t" << (void*) byteBufferEnd << "\t" << (byteBufferEnd - byteBuffer)); + //VK_VALIDATION_MESSAGE("Updating " << name << " in " << filename); + //VK_VALIDATION_MESSAGE("Iterator: " << (void*) byteBuffer << "\t" << (void*) byteBufferEnd << "\t" << (byteBufferEnd - byteBuffer)); #endif parseDefinition(payload, definitions); #if UF_SHADER_TRACK_NAMES - VK_VALIDATION_MESSAGE("Iterator: " << (void*) byteBuffer << "\t" << (void*) byteBufferEnd << "\t" << (byteBufferEnd - byteBuffer)); + //VK_VALIDATION_MESSAGE("Iterator: " << (void*) byteBuffer << "\t" << (void*) byteBufferEnd << "\t" << (byteBufferEnd - byteBuffer)); #endif #endif return userdata; @@ -489,14 +489,14 @@ void ext::vulkan::Shader::initialize( ext::vulkan::Device& device, const uf::stl size_t bufferSize = comp.get_declared_struct_size(base_type); if ( bufferSize <= 0 ) break; if ( bufferSize > device.properties.limits.maxUniformBufferRange ) { - VK_DEBUG_VALIDATION_MESSAGE("Invalid uniform buffer length of " << bufferSize << " 
for shader " << filename); + VK_DEBUG_VALIDATION_MESSAGE("Invalid uniform buffer length of {} for shader {}", bufferSize, filename); bufferSize = device.properties.limits.maxUniformBufferRange; } bufferSize = ALIGNED_SIZE( bufferSize, device.properties.limits.minUniformBufferOffsetAlignment ); { - VK_DEBUG_VALIDATION_MESSAGE("Uniform size of " << bufferSize << " for shader " << filename); + VK_DEBUG_VALIDATION_MESSAGE("Uniform size of {} for shader {}", bufferSize, filename); // auto& uniform = uniforms.emplace_back(); // uniform.create( bufferSize ); } @@ -564,7 +564,7 @@ void ext::vulkan::Shader::initialize( ext::vulkan::Device& device, const uf::stl #define LOOP_RESOURCES( key, type ) for ( size_t i = 0; i < res.key.size(); ++i ) {\ const auto& resource = res.key[i];\ - VK_DEBUG_VALIDATION_MESSAGE("["< device.properties.limits.maxPushConstantsSize ) { - VK_DEBUG_VALIDATION_MESSAGE("Invalid push constant length of " << size << " for shader " << filename); + VK_DEBUG_VALIDATION_MESSAGE("Invalid push constant length of {} for shader {}", size, filename); + //VK_DEBUG_VALIDATION_MESSAGE("Invalid push constant length of " << size << " for shader " << filename); size = device.properties.limits.maxPushConstantsSize; } - VK_DEBUG_VALIDATION_MESSAGE("Push constant size of " << size << " for shader " << filename); + VK_DEBUG_VALIDATION_MESSAGE("Push constant size of {} for shader {},", size, filename); { auto& pushConstant = pushConstants.emplace_back(); pushConstant.create( size ); @@ -724,7 +725,7 @@ void ext::vulkan::Shader::initialize( ext::vulkan::Device& device, const uf::stl specializationMapEntries.emplace_back(specializationMapEntry); } specializationConstants.create( specializationSize ); - VK_DEBUG_VALIDATION_MESSAGE("Specialization constants size of " << specializationSize << " for shader " << filename); + VK_DEBUG_VALIDATION_MESSAGE("Specialization constants size of {} for shader {}", specializationSize, filename); uint8_t* s = (uint8_t*) (void*) 
specializationConstants; size_t offset = 0; @@ -798,7 +799,7 @@ void ext::vulkan::Shader::initialize( ext::vulkan::Device& device, const uf::stl definition.validate = false; } break; default: { - VK_DEBUG_VALIDATION_MESSAGE("Unregistered specialization constant type at offset " << offset << " for shader " << filename ); + VK_DEBUG_VALIDATION_MESSAGE("Unregistered specialization constant type at offset {} for shader {}", offset, filename ); } break; } #if UF_SHADER_PARSE_AS_JSON @@ -806,7 +807,7 @@ void ext::vulkan::Shader::initialize( ext::vulkan::Device& device, const uf::stl member["size"] = size; member["default"] = member["value"]; metadata.json["specializationConstants"].emplace_back(member); - VK_DEBUG_VALIDATION_MESSAGE("Specialization constant: " << member["type"].as() << " " << name << " = " << member["value"].dump() << "; at offset " << offset << " for shader " << filename ); + //VK_DEBUG_VALIDATION_MESSAGE("Specialization constant: " << member["type"].as() << " " << name << " = " << member["value"].dump() << "; at offset " << offset << " for shader " << filename ); #endif memcpy( &s[offset], &buffer, size ); @@ -859,7 +860,7 @@ bool ext::vulkan::Shader::validate() { if ( it == uniforms.end() ) break; auto& uniform = *(it++); if ( uniform.data().len != buffer.allocationInfo.size ) { - VK_DEBUG_VALIDATION_MESSAGE("Uniform size mismatch: Expected " << buffer.allocationInfo.size << ", got " << uniform.data().len << "; fixing..."); + VK_DEBUG_VALIDATION_MESSAGE("Uniform size mismatch: Expected {}, got {}; fixing...", buffer.allocationInfo.size, uniform.data().len); uniform.destroy(); uniform.create(buffer.allocationInfo.size); valid = false; diff --git a/engine/src/ext/vulkan/texture.cpp b/engine/src/ext/vulkan/texture.cpp index 28ff0ba7..94c2963d 100644 --- a/engine/src/ext/vulkan/texture.cpp +++ b/engine/src/ext/vulkan/texture.cpp @@ -205,12 +205,15 @@ void ext::vulkan::Texture::destroy( bool defer ) { if ( !device || !device->logicalDevice || aliased ) 
return; // device->logicalDevice should never be null, but it happens, somehow if ( defer ) { - ext::vulkan::gc::textures.emplace_back( *this ); + ext::vulkan::mutex.lock(); + device->transient.textures.emplace_back(*this); + ext::vulkan::mutex.unlock(); return; } if ( view != VK_NULL_HANDLE ) { vkDestroyImageView(device->logicalDevice, view, nullptr); + VK_UNREGISTER_HANDLE( view ); view = VK_NULL_HANDLE; } if ( image != VK_NULL_HANDLE ) { @@ -581,6 +584,7 @@ void ext::vulkan::Texture::fromBuffers( viewCreateInfo.subresourceRange.levelCount = this->mips; viewCreateInfo.image = image; VK_CHECK_RESULT(vkCreateImageView(device.logicalDevice, &viewCreateInfo, nullptr, &view)); + VK_REGISTER_HANDLE( view ); { auto commandBuffer = device.fetchCommandBuffer(uf::renderer::QueueEnum::GRAPHICS); @@ -890,7 +894,7 @@ void ext::vulkan::Texture::generateMipmaps( VkCommandBuffer commandBuffer, uint3 int32_t mipWidth = width; int32_t mipHeight = height; - int32_t mipDepth = depth; + int32_t mipDepth = MAX(1, depth); for ( size_t i = 1; i < this->mips; ++i ) { // transition previous layer to read from it barrier.subresourceRange.baseMipLevel = i - 1; @@ -1150,7 +1154,7 @@ uf::Image ext::vulkan::Texture3D::screenshot( uint32_t layerID ) { imageCopy.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; imageCopy.dstSubresource.baseArrayLayer = 0; imageCopy.dstSubresource.layerCount = 1; - imageCopy.dstOffset = { 0, 0, 0 }; + imageCopy.dstOffset = { 0, 0, layerID }; imageCopy.extent = { this->width, this->height, 1 }; device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "copyImage" ); diff --git a/engine/src/ext/vulkan/vulkan.cpp b/engine/src/ext/vulkan/vulkan.cpp index 52640966..6448be5e 100644 --- a/engine/src/ext/vulkan/vulkan.cpp +++ b/engine/src/ext/vulkan/vulkan.cpp @@ -116,8 +116,6 @@ uint32_t ext::vulkan::states::frameAccumulate = 0; bool ext::vulkan::states::frameAccumulateReset = false; uint32_t ext::vulkan::states::frameSkip = 0; -uf::stl::vector 
ext::vulkan::gc::textures; - uf::ThreadUnique ext::vulkan::currentRenderMode; ext::vulkan::Buffer ext::vulkan::scratchBuffer; @@ -495,6 +493,7 @@ void ext::vulkan::initialize( bool soft ) { void ext::vulkan::tick() { // ext::vulkan::mutex.lock(); if ( ext::vulkan::states::resized || ext::vulkan::settings::experimental::rebuildOnTickBegin ) { + synchronize(0b11); ext::vulkan::states::rebuild = true; ::skip = true; } @@ -534,13 +533,6 @@ void ext::vulkan::tick() { uf::thread::execute( tasks ); -/* - for ( auto& texture : ext::vulkan::gc::textures ) { - texture.destroy( false ); - } - ext::vulkan::gc::textures.clear(); -*/ - if ( ext::vulkan::states::rebuild && ext::vulkan::settings::experimental::skipRenderOnRebuild ) ::skip = true; ext::vulkan::states::rebuild = false; @@ -666,6 +658,9 @@ void ext::vulkan::render() { for ( auto& buffer : transient.buffers ) buffer.destroy(false); transient.buffers.clear(); + for ( auto& texture : transient.textures ) texture.destroy(false); + transient.textures.clear(); + for ( auto& as : transient.ass ) { uf::renderer::vkDestroyAccelerationStructureKHR(device, as.handle, nullptr); VK_UNREGISTER_HANDLE( as.handle ); @@ -677,7 +672,7 @@ void ext::vulkan::destroy( bool soft ) { ext::vulkan::flushCommandBuffers(); // ext::vulkan::mutex.lock(); - synchronize(); + synchronize(0b11); #if UF_USE_FFX_FSR if ( settings::pipelines::fsr ) { @@ -725,7 +720,7 @@ void ext::vulkan::destroy( bool soft ) { // ext::vulkan::mutex.unlock(); // check for any leaked resources - if ( false ) { + if ( ext::vulkan::settings::validation::checkpoints ) { UF_MSG_DEBUG("Leaked resources:"); for ( auto& resource : ext::vulkan::Resource::handles ) {