From 6d05ab865a02aa9b6c405c86eb19bfc74f047965 Mon Sep 17 00:00:00 2001 From: ecker Date: Tue, 21 Apr 2026 20:49:47 -0500 Subject: [PATCH] neurotic optimizations (overhauled depth pyramid and bloom to ffx-sdp), more fixes --- bin/data/config.json | 5 +- .../scenes/sourceengine/sourceengine.json | 4 +- bin/data/shaders/common/functions.h | 3 +- bin/data/shaders/display/bloom/comp.glsl | 65 - bin/data/shaders/display/bloom/down.comp.glsl | 131 ++ bin/data/shaders/display/bloom/up.comp.glsl | 60 + bin/data/shaders/display/deferred/comp/comp.h | 4 +- .../shaders/display/depth-pyramid/comp.glsl | 103 +- bin/data/shaders/ext/ffx_a.h | 1907 +++++++++++++++++ bin/data/shaders/ext/ffx_spd.h | 1297 +++++++++++ bin/data/shaders/graph/cull/comp.glsl | 190 +- bin/data/shaders/raytrace/shader.ray-gen.glsl | 2 +- engine/inc/uf/engine/graph/graph.h | 2 + engine/inc/uf/ext/vulkan/device.h | 3 + engine/inc/uf/ext/vulkan/graphic.h | 4 +- engine/inc/uf/ext/vulkan/texture.h | 2 +- engine/inc/uf/ext/vulkan/vk.h | 2 +- engine/inc/uf/ext/vulkan/vulkan.h | 4 - engine/src/engine/ext/scene/behavior.cpp | 12 +- engine/src/engine/graph/graph.cpp | 6 +- engine/src/ext/vulkan/graphic.cpp | 33 +- engine/src/ext/vulkan/rendermodes/base.cpp | 15 +- .../src/ext/vulkan/rendermodes/deferred.cpp | 444 ++-- .../src/ext/vulkan/rendermodes/transition.inl | 58 +- engine/src/ext/vulkan/shader.cpp | 37 +- engine/src/ext/vulkan/texture.cpp | 10 +- engine/src/ext/vulkan/vulkan.cpp | 17 +- 27 files changed, 3986 insertions(+), 434 deletions(-) delete mode 100644 bin/data/shaders/display/bloom/comp.glsl create mode 100644 bin/data/shaders/display/bloom/down.comp.glsl create mode 100644 bin/data/shaders/display/bloom/up.comp.glsl create mode 100644 bin/data/shaders/ext/ffx_a.h create mode 100644 bin/data/shaders/ext/ffx_spd.h diff --git a/bin/data/config.json b/bin/data/config.json index 950847d9..da51c7a9 100644 --- a/bin/data/config.json +++ b/bin/data/config.json @@ -110,7 +110,7 @@ "default stage buffers": 
true, "default defer buffer destroy": true, "default command buffer immediate": true, - "multithreaded recording": true + "multithreaded recording": false }, "pipelines": { "deferred": true, @@ -147,7 +147,8 @@ "deviceCoherentMemory", "robustBufferAccess", "samplerAnisotropy", - "sampleRateShading" + "sampleRateShading", + "samplerFilterMinmax" ], "featureChain": [] }, diff --git a/bin/data/scenes/sourceengine/sourceengine.json b/bin/data/scenes/sourceengine/sourceengine.json index feac6dec..95a20fe6 100644 --- a/bin/data/scenes/sourceengine/sourceengine.json +++ b/bin/data/scenes/sourceengine/sourceengine.json @@ -1,9 +1,9 @@ { // "import": "./rp_downtown_v2.json" - "import": "./ss2_medsci1.json" +// "import": "./ss2_medsci1.json" // "import": "./test_grid.json" // "import": "./sh2_mcdonalds.json" // "import": "./animal_crossing.json" -// "import": "./mds_mcdonalds.json" + "import": "./mds_mcdonalds.json" // "import": "./gm_construct.json" } \ No newline at end of file diff --git a/bin/data/shaders/common/functions.h b/bin/data/shaders/common/functions.h index 95ce4718..e626031a 100644 --- a/bin/data/shaders/common/functions.h +++ b/bin/data/shaders/common/functions.h @@ -44,6 +44,7 @@ void gammaCorrect( inout vec3 color, float gamma ) { } void toneMap( inout vec4 color, float exposure ) { toneMap(color.rgb, exposure); } void gammaCorrect( inout vec4 color, float gamma ) { gammaCorrect(color.rgb, gamma); } +float luma( vec3 color ) { return dot(color, vec3(0.2126, 0.7152, 0.0722)); } // uint tea(uint val0, uint val1) { uint v0 = val0; @@ -152,6 +153,7 @@ vec3 decodeSrgb(vec3 rgb) { const vec3 c = step(vec3(0.04045), rgb); return mix(a, b, c); } +#if !SPD && (DEFERRED || FRAGMENT || COMPUTE || RT) bool validTextureIndex( int textureIndex ) { return 0 <= textureIndex && textureIndex < MAX_TEXTURES; } @@ -160,7 +162,6 @@ bool validCubemapIndex( int textureIndex ) { return 0 <= textureIndex && textureIndex < MAX_CUBEMAPS; } #endif -#if !BLOOM && (DEFERRED || FRAGMENT 
|| COMPUTE || RT) bool validTextureIndex( uint id ) { return 0 <= id && id < MAX_TEXTURES; } diff --git a/bin/data/shaders/display/bloom/comp.glsl b/bin/data/shaders/display/bloom/comp.glsl deleted file mode 100644 index 058764f3..00000000 --- a/bin/data/shaders/display/bloom/comp.glsl +++ /dev/null @@ -1,65 +0,0 @@ -#version 450 -#pragma shader_stage(compute) - -#define COMPUTE 1 -#define TEXTURES 0 -#define CUBEMAPS 0 -#define BLOOM 1 - -layout (local_size_x = 16, local_size_y = 16, local_size_z = 1) in; - -layout( push_constant ) uniform PushBlock { - uint eye; - uint mode; -} PushConstant; - -layout (binding = 0) uniform UBO { - float threshold; - float smoothness; - uint size; - float padding1; - - float weights[32]; -} ubo; - -layout (binding = 1, rgba16f) uniform image2D imageColor; -layout (binding = 2, rgba16f) uniform image2D imageBloom; -layout (binding = 3, rgba16f) uniform image2D imagePingPong; - -#include "../../common/macros.h" -#include "../../common/structs.h" -#include "../../common/functions.h" - -void main() { - const uint mode = PushConstant.mode; - const ivec2 texel = ivec2(gl_GlobalInvocationID.xy); - const ivec2 size = imageSize( imageColor ); - if ( texel.x >= size.x || texel.y >= size.y ) return; - - if ( mode == 0 ) { // fill bloom - vec3 result = imageLoad( imageColor, texel ).rgb; - float brightness = dot(result, vec3(0.2126, 0.7152, 0.0722)); - if( brightness < ubo.threshold ) result = vec3(0.0); - imageStore(imageBloom, texel, vec4(result, 1.0)); - } else if ( mode == 1 ) { // bloom horizontal - vec3 result = imageLoad( imageBloom, texel ).rgb * ubo.weights[0]; - for ( int i = 1; i < int(ubo.size); ++i ) { - vec3 c1 = imageLoad( imageBloom, texel + ivec2(i, 0) ).rgb; - vec3 c2 = imageLoad( imageBloom, texel - ivec2(i, 0) ).rgb; - result += (c1 + c2) * ubo.weights[i]; - } - imageStore( imagePingPong, texel, vec4(result, 1.0) ); - } else if ( mode == 2 ) { // bloom vertical - vec3 result = imageLoad( imagePingPong, texel ).rgb * 
ubo.weights[0]; - for( int i = 1; i < int(ubo.size); ++i ) { - vec3 c1 = imageLoad( imagePingPong, texel + ivec2(0, i) ).rgb; - vec3 c2 = imageLoad( imagePingPong, texel - ivec2(0, i) ).rgb; - result += (c1 + c2) * ubo.weights[i]; - } - imageStore(imageBloom, texel, vec4(result, 1.0)); - } else if ( mode == 3 ) { // combine - vec3 base = imageLoad( imageColor, texel ).rgb; - vec3 bloom = imageLoad( imageBloom, texel ).rgb; - imageStore( imageColor, texel, vec4(base + bloom, 1.0) ); - } -} \ No newline at end of file diff --git a/bin/data/shaders/display/bloom/down.comp.glsl b/bin/data/shaders/display/bloom/down.comp.glsl new file mode 100644 index 00000000..02f350c3 --- /dev/null +++ b/bin/data/shaders/display/bloom/down.comp.glsl @@ -0,0 +1,131 @@ +#version 450 +#pragma shader_stage(compute) + +#extension GL_KHR_shader_subgroup_quad : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_samplerless_texture_functions : enable + +#define COMPUTE 1 +#define SPD 1 + +#include "../../common/macros.h" +#include "../../common/structs.h" +#include "../../common/functions.h" + +layout (local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout (constant_id = 0) const uint MIPS = 6; + +layout(push_constant) uniform PushBlock { + uint mips; + uint numWorkGroups; + uint workGroupOffset; +} PushConstant_; + + +layout (binding = 0, rgba16f) uniform image2D imageColor; +layout (binding = 1, rgba16f) uniform image2D imageBright; // yucky +layout (binding = 2, rgba16f) coherent uniform image2D outImage[MIPS]; + +layout (binding = 3, std430) buffer AtomicCounter { + uint counter; +} spdCounter; + +layout (binding = 4) uniform UBO { + float threshold; + float smoothness; + uint size; + float padding1; + + float weights[32]; +} ubo; + +#define A_GLSL 1 +#define A_GPU 1 +#define SPD_NO_WAVE_OPERATIONS 0 +#include "../../ext/ffx_a.h" + +shared AU1 spd_counter; +shared AF4 spd_intermediate[16][16]; + +vec3 applySoftKnee(vec3 color, float 
luminance) { + float rq = clamp(luminance - ubo.threshold + ubo.smoothness, 0.0, 2.0 * ubo.smoothness); + rq = (rq * rq) / (4.0 * ubo.smoothness + 0.0001); + + float value = max(rq, luminance - ubo.threshold); + + return color * (value / (max(luminance, 0.0001))); +} + +AF4 SpdLoadSourceImage(ASU2 p, AU1 slice) { + ivec2 size = imageSize(imageColor); + + // sample color if in bound, else black + vec3 c0 = p.x < size.x && p.y < size.y ? imageLoad(imageColor, p + ivec2(0, 0)).rgb : vec3(0.0); + vec3 c1 = p.x + 1 < size.x && p.y < size.y ? imageLoad(imageColor, p + ivec2(1, 0)).rgb : vec3(0.0); + vec3 c2 = p.x < size.x && p.y + 1 < size.y ? imageLoad(imageColor, p + ivec2(0, 1)).rgb : vec3(0.0); + vec3 c3 = p.x + 1 < size.x && p.y + 1 < size.y ? imageLoad(imageColor, p + ivec2(1, 1)).rgb : vec3(0.0); + + // get luma + float b0 = luma(c0); + float b1 = luma(c1); + float b2 = luma(c2); + float b3 = luma(c3); + + // soften + c0 = applySoftKnee(c0, b0); + c1 = applySoftKnee(c1, b1); + c2 = applySoftKnee(c2, b2); + c3 = applySoftKnee(c3, b3); + + // karis luma weighted average + float w0 = 1.0 / (b0 + 1.0); + float w1 = 1.0 / (b1 + 1.0); + float w2 = 1.0 / (b2 + 1.0); + float w3 = 1.0 / (b3 + 1.0); + float inv_wsum = 1.0 / (w0 + w1 + w2 + w3); + + // store to mip 0 + if (p.x < size.x && p.y < size.y) imageStore(outImage[0], p + ivec2(0, 0), vec4(c0, 1.0)); + if (p.x + 1 < size.x && p.y < size.y) imageStore(outImage[0], p + ivec2(1, 0), vec4(c1, 1.0)); + if (p.x < size.x && p.y + 1 < size.y) imageStore(outImage[0], p + ivec2(0, 1), vec4(c2, 1.0)); + if (p.x + 1 < size.x && p.y + 1 < size.y) imageStore(outImage[0], p + ivec2(1, 1), vec4(c3, 1.0)); + + // average + return AF4((c0 * w0 + c1 * w1 + c2 * w2 + c3 * w3) * inv_wsum, 1.0); +} + +AF4 SpdLoad(ASU2 p, AU1 slice) { + uint loadMip = min(6u - 1, MIPS - 1); + return imageLoad(outImage[loadMip + 1], p); +} + +void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice) { + if ( mip + 1 < MIPS ) { + imageStore(outImage[mip + 1], p, 
value); + } +} + +// average filter +AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3) { + return (v0 + v1 + v2 + v3) * 0.25; +} + +AF4 SpdLoadIntermediate(AU1 x, AU1 y) { return spd_intermediate[x][y]; } +void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value) { spd_intermediate[x][y] = value; } + +void SpdIncreaseAtomicCounter(AU1 slice) { spd_counter = atomicAdd(spdCounter.counter, 1); } +AU1 SpdGetAtomicCounter() { return spd_counter; } +void SpdResetAtomicCounter(AU1 slice) { spdCounter.counter = 0; } + +#include "../../ext/ffx_spd.h" + +void main() { + SpdDownsample( + AU2(gl_WorkGroupID.xy), + AU1(gl_LocalInvocationIndex), + AU1(PushConstant_.mips - 1), + AU1(PushConstant_.numWorkGroups), + AU1(PushConstant_.workGroupOffset) + ); +} \ No newline at end of file diff --git a/bin/data/shaders/display/bloom/up.comp.glsl b/bin/data/shaders/display/bloom/up.comp.glsl new file mode 100644 index 00000000..057fc76f --- /dev/null +++ b/bin/data/shaders/display/bloom/up.comp.glsl @@ -0,0 +1,60 @@ +#version 450 +#pragma shader_stage(compute) + +#define COMPUTE 1 + +layout (local_size_x = 16, local_size_y = 16, local_size_z = 1) in; + +layout (constant_id = 0) const uint MIPS = 6; + +layout (binding = 0, rgba16f) uniform image2D imageColor; +layout (binding = 1) uniform sampler2D samplerBloom; + +layout (binding = 2) uniform UBO { + float threshold; + float smoothness; + uint size; + float padding1; + + float weights[32]; +} ubo; + +// 9-tap bilinear tent filter +vec3 tentFilter(sampler2D tex, vec2 uv, float lod) { + vec2 texSize = vec2(textureSize(tex, int(lod))); + vec4 d = (1.0 / texSize.xyxy) * vec4(1.0, 1.0, -1.0, 0.0); + + vec3 s = textureLod(tex, uv - d.xy, lod).rgb; + s += textureLod(tex, uv - d.wy, lod).rgb * 2.0; + s += textureLod(tex, uv - d.zy, lod).rgb; + s += textureLod(tex, uv + d.zw, lod).rgb * 2.0; + s += textureLod(tex, uv, lod).rgb * 4.0; + s += textureLod(tex, uv + d.xw, lod).rgb * 2.0; + s += textureLod(tex, uv + d.zy, lod).rgb; + s += textureLod(tex, uv + 
d.wy, lod).rgb * 2.0; + s += textureLod(tex, uv + d.xy, lod).rgb; + + return s * (1.0 / 16.0); +} + +void main() { + ivec2 texel = ivec2(gl_GlobalInvocationID.xy); + ivec2 size = imageSize(imageColor); + if ( texel.x >= size.x || texel.y >= size.y ) return; + + vec2 uv = (vec2(texel) + 0.5) / vec2(size); + vec3 bloomAcc = vec3(0.0); + float weightSum = 0.0; + + for ( uint i = 0; i < min(MIPS, ubo.size); ++i ) { + float w = ubo.weights[i]; + bloomAcc += textureLod(samplerBloom, uv, float(i)).rgb * w; + //bloomAcc += tentFilter(samplerBloom, uv, float(i)) * w; + weightSum += w; + } + + if ( weightSum > 0.0 ) bloomAcc /= weightSum; + + vec3 base = imageLoad( imageColor, texel ).rgb; + imageStore( imageColor, texel, vec4(base + bloomAcc, 1.0) ); +} \ No newline at end of file diff --git a/bin/data/shaders/display/deferred/comp/comp.h b/bin/data/shaders/display/deferred/comp/comp.h index 9e37b8db..22e32141 100644 --- a/bin/data/shaders/display/deferred/comp/comp.h +++ b/bin/data/shaders/display/deferred/comp/comp.h @@ -159,7 +159,7 @@ void postProcess() { #if FOG fog( surface.ray, surface.fragment.rgb, surface.fragment.a ); #endif - float brightness = dot(surface.fragment.rgb, vec3(0.2126, 0.7152, 0.0722)); + float brightness = luma(surface.fragment.rgb); bool bloom = brightness > ubo.settings.bloom.threshold; //if ( bloom ) toneMap( surface.fragment.rgb, brightness ); vec4 outFragColor = vec4(surface.fragment.rgb, 1.0); @@ -184,7 +184,7 @@ void postProcess() { } IMAGE_STORE( imageColor, outFragColor ); - IMAGE_STORE( imageBright, outFragBright ); + //IMAGE_STORE( imageBright, outFragBright ); IMAGE_STORE( imageMotion, vec4(outFragMotion, 0, 0) ); } diff --git a/bin/data/shaders/display/depth-pyramid/comp.glsl b/bin/data/shaders/display/depth-pyramid/comp.glsl index efbaa5e5..2285e7df 100644 --- a/bin/data/shaders/display/depth-pyramid/comp.glsl +++ b/bin/data/shaders/display/depth-pyramid/comp.glsl @@ -1,35 +1,96 @@ #version 450 #pragma shader_stage(compute) 
-//#extension GL_EXT_nonuniform_qualifier : enable - -layout (constant_id = 0) const uint MIPS = 6; -layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in; +#extension GL_KHR_shader_subgroup_quad : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_samplerless_texture_functions : enable #define COMPUTE 1 +#define SPD 1 #include "../../common/macros.h" #include "../../common/structs.h" +#include "../../common/functions.h" -layout( push_constant ) uniform PushBlock { - uint _; - uint pass; -} PushConstant; +layout (local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout (constant_id = 0) const uint MIPS = 6; + +layout(push_constant) uniform PushBlock { + uint mips; + uint numWorkGroups; + uint workGroupOffset; +} PushConstant_; layout (binding = 0) uniform sampler2D samplerDepth; -layout (binding = 1) uniform sampler2D inImage[MIPS]; -layout (binding = 2, r32f) uniform writeonly image2D outImage[MIPS]; +layout (binding = 1, r32f) coherent uniform image2D outImage[MIPS]; + +layout (binding = 2, std430) buffer AtomicCounter { + uint counter; +} spdCounter; + + +#define A_GLSL 1 +#define A_GPU 1 +#define SPD_NO_WAVE_OPERATIONS 0 +#include "../../ext/ffx_a.h" + +shared AU1 spd_counter; +shared AF1 spd_intermediate[16][16]; + +AF4 SpdLoadSourceImage(ASU2 p, AU1 slice) { + ivec2 size = imageSize(outImage[0]); + + // sample depth if in bound, else 0 (0 for reverse-z projection, use 1 if normal projection) + float d0 = p.x < size.x && p.y < size.y ? texelFetch(samplerDepth, p + ivec2(0, 0), 0).x : 0.0; + float d1 = p.x + 1 < size.x && p.y < size.y ? texelFetch(samplerDepth, p + ivec2(1, 0), 0).x : 0.0; + float d2 = p.x < size.x && p.y + 1 < size.y ? texelFetch(samplerDepth, p + ivec2(0, 1), 0).x : 0.0; + float d3 = p.x + 1 < size.x && p.y + 1 < size.y ? 
texelFetch(samplerDepth, p + ivec2(1, 1), 0).x : 0.0; + + // store to mip 0 + if (p.x < size.x && p.y < size.y) imageStore(outImage[0], p + ivec2(0, 0), vec4(d0)); + if (p.x + 1 < size.x && p.y < size.y) imageStore(outImage[0], p + ivec2(1, 0), vec4(d1)); + if (p.x < size.x && p.y + 1 < size.y) imageStore(outImage[0], p + ivec2(0, 1), vec4(d2)); + if (p.x + 1 < size.x && p.y + 1 < size.y) imageStore(outImage[0], p + ivec2(1, 1), vec4(d3)); + + return AF4(d0, d1, d2, d3); +} + +AF4 SpdLoad(ASU2 p, AU1 slice) { + uint loadMip = min(6u, MIPS - 1); + float d = imageLoad(outImage[loadMip], p).r; + return AF4(d, d, d, d); +} + +void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice) { + if ( mip + 1 < MIPS ) { + imageStore(outImage[mip + 1], p, vec4(value.x)); + } +} + +AF4 SpdLoadIntermediate(AU1 x, AU1 y) { + float d = spd_intermediate[x][y]; + return AF4(d, d, d, d); +} +void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value) { spd_intermediate[x][y] = value.x; } +void SpdIncreaseAtomicCounter(AU1 slice) { spd_counter = atomicAdd(spdCounter.counter, 1); } +AU1 SpdGetAtomicCounter() { return spd_counter; } +void SpdResetAtomicCounter(AU1 slice) { spdCounter.counter = 0; } + +// min filter +AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3) { + float minVal = min(min(v0.x, v1.x), min(v2.x, v3.x)); + return AF4(minVal, minVal, minVal, minVal); +} + +#include "../../ext/ffx_spd.h" void main() { - int mip = int(PushConstant.pass); - - float depth; - ivec2 pos = ivec2(gl_GlobalInvocationID.xy); - if ( mip == 0 ) { - depth = texelFetch(samplerDepth, pos, 0).r; - } else { - depth = texture(inImage[mip - 1], (vec2(gl_GlobalInvocationID.xy) + vec2(0.5)) / imageSize( outImage[mip] )).x; - } - - imageStore(outImage[mip], pos, vec4(depth)); + SpdDownsample( + AU2(gl_WorkGroupID.xy), + AU1(gl_LocalInvocationIndex), + AU1(PushConstant_.mips - 1), + AU1(PushConstant_.numWorkGroups), + AU1(PushConstant_.workGroupOffset) + ); } \ No newline at end of file diff --git a/bin/data/shaders/ext/ffx_a.h 
b/bin/data/shaders/ext/ffx_a.h new file mode 100644 index 00000000..0a7cc01d --- /dev/null +++ b/bin/data/shaders/ext/ffx_a.h @@ -0,0 +1,1907 @@ +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// [A] SHADER PORTABILITY 1.20190530 +// +//============================================================================================================================== +// LICENSE +// ======= +// Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) <2014> +// ------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// Common central point for high-level shading language and C portability for various shader headers. +//------------------------------------------------------------------------------------------------------------------------------ +// DEFINES +// ======= +// A_CPU ..... Include the CPU related code. +// A_GPU ..... Include the GPU related code. +// A_GLSL .... Using GLSL. +// A_HLSL .... Using HLSL. +// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default). +// ======= +// A_BYTE .... Support 8-bit integer. +// A_HALF .... Support 16-bit integer and floating point. +// A_LONG .... Support 64-bit integer. +// A_DUBL .... Support 64-bit floating point. +// ======= +// A_WAVE .... Support wave-wide operations. +//------------------------------------------------------------------------------------------------------------------------------ +// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'. +//------------------------------------------------------------------------------------------------------------------------------ +// SIMPLIFIED TYPE SYSTEM +// ====================== +// - All ints will be unsigned with exception of when signed is required. +// - Type naming simplified and shortened "A<#components>", +// - H = 16-bit float (half) +// - F = 32-bit float (float) +// - D = 64-bit float (double) +// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte) +// - B = 8-bit integer (byte) +// - W = 16-bit integer (word) +// - U = 32-bit integer (unsigned) +// - L = 64-bit integer (long) +// - Using "AS<#components>" for signed when required. 
+//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops). +// - Add subgroup ops. +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGE LOG +// ========== +// 20190531 - Fixed changed to llabs() because long is int on Windows. +// 20190530 - Updated for new CPU/GPU portability. +// 20190528 - Fix AU1_AH2_x() on HLSL (had incorrectly swapped x and y), fixed asuint() cases. +// 20190527 - Added min3/max3 for low precision for HLSL. +// 20190526 - Updated with half approximations, added ARsq*(), and ASat*() for CPU. +// 20190519 - Added more approximations. +// 20190514 - Added long conversions. +// 20190513 - Added the real BFI moved the other one to ABfiM(). +// 20190507 - Added extra remap useful for 2D reductions. +// 20190507 - Started adding wave ops, add parabolic sin/cos. +// 20190505 - Added ASigned*() and friends, setup more auto-typecast, GLSL extensions, etc. +// 20190504 - Added min3/max3 for 32-bit integers. +// 20190503 - Added type reinterpretation for half. +// 20190416 - Added min3/max3 for half. +// 20190405 - Misc bug fixing. +// 20190404 - Cleaned up color conversion code. Switched "splat" to shorter naming "type_". Misc bug fixing. 
+//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COMMON +//============================================================================================================================== +#define A_2PI 6.28318530718 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// CPU +// +// 
+//============================================================================================================================== +// Requires standard C types: stdint.h +// Requires a collection of standard math intrinsics. +// - Requires VS2013 when not using GCC to get exp2() and log2(). +// - https://blogs.msdn.microsoft.com/vcblog/2013/07/19/c99-library-support-in-visual-studio-2013/ +//------------------------------------------------------------------------------------------------------------------------------ +// This provides a minimum subset of functionality compared to the GPU parts. +//============================================================================================================================== +#ifdef A_CPU + // Supporting user defined overrides. + #ifndef A_RESTRICT + #define A_RESTRICT __restrict + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifndef A_STATIC + #define A_STATIC static + #endif +//------------------------------------------------------------------------------------------------------------------------------ + // Same types across CPU and GPU. + // Predicate uses 32-bit integer (C friendly bool). 
+ typedef uint32_t AP1; + typedef float AF1; + typedef double AD1; + typedef uint8_t AB1; + typedef uint16_t AW1; + typedef uint32_t AU1; + typedef uint64_t AL1; + typedef int8_t ASB1; + typedef int16_t ASW1; + typedef int32_t ASU1; + typedef int64_t ASL1; +//------------------------------------------------------------------------------------------------------------------------------ + #define AD1_(a) ((AD1)(a)) + #define AF1_(a) ((AF1)(a)) + #define AL1_(a) ((AL1)(a)) + #define AU1_(a) ((AU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1_(a) ((ASL1)(a)) + #define ASU1_(a) ((ASU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;} +//------------------------------------------------------------------------------------------------------------------------------ + #define A_TRUE 1 + #define A_FALSE 0 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// CPU/GPU PORTING +// 
+//------------------------------------------------------------------------------------------------------------------------------ +// Hackary to get CPU and GPU to share all setup code, without duplicate code paths. +// Unfortunately this is the level of "ugly" that is required since the languages are very different. +// This uses a lower-case prefix for special vector constructs. +// - In C restrict pointers are used. +// - In the shading language, in/inout/out arguments are used. +// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]). +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD1 *A_RESTRICT + #define retAD3 AD1 *A_RESTRICT + #define retAD4 AD1 *A_RESTRICT + #define retAF2 AF1 *A_RESTRICT + #define retAF3 AF1 *A_RESTRICT + #define retAF4 AF1 *A_RESTRICT + #define retAL2 AL1 *A_RESTRICT + #define retAL3 AL1 *A_RESTRICT + #define retAL4 AL1 *A_RESTRICT + #define retAU2 AU1 *A_RESTRICT + #define retAU3 AU1 *A_RESTRICT + #define retAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 AD1 *A_RESTRICT + #define 
inAD3 AD1 *A_RESTRICT + #define inAD4 AD1 *A_RESTRICT + #define inAF2 AF1 *A_RESTRICT + #define inAF3 AF1 *A_RESTRICT + #define inAF4 AF1 *A_RESTRICT + #define inAL2 AL1 *A_RESTRICT + #define inAL3 AL1 *A_RESTRICT + #define inAL4 AL1 *A_RESTRICT + #define inAU2 AU1 *A_RESTRICT + #define inAU3 AU1 *A_RESTRICT + #define inAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 AD1 *A_RESTRICT + #define inoutAD3 AD1 *A_RESTRICT + #define inoutAD4 AD1 *A_RESTRICT + #define inoutAF2 AF1 *A_RESTRICT + #define inoutAF3 AF1 *A_RESTRICT + #define inoutAF4 AF1 *A_RESTRICT + #define inoutAL2 AL1 *A_RESTRICT + #define inoutAL3 AL1 *A_RESTRICT + #define inoutAL4 AL1 *A_RESTRICT + #define inoutAU2 AU1 *A_RESTRICT + #define inoutAU3 AU1 *A_RESTRICT + #define inoutAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 AD1 *A_RESTRICT + #define outAD3 AD1 *A_RESTRICT + #define outAD4 AD1 *A_RESTRICT + #define outAF2 AF1 *A_RESTRICT + #define outAF3 AF1 *A_RESTRICT + #define outAF4 AF1 *A_RESTRICT + #define outAL2 AL1 *A_RESTRICT + #define outAL3 AL1 *A_RESTRICT + #define outAL4 AL1 *A_RESTRICT + #define outAU2 AU1 *A_RESTRICT + #define outAU3 AU1 *A_RESTRICT + #define outAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD1 x[2] + #define varAD3(x) AD1 x[3] + #define varAD4(x) AD1 x[4] + #define varAF2(x) AF1 x[2] + #define varAF3(x) AF1 x[3] + #define varAF4(x) AF1 x[4] + #define varAL2(x) AL1 x[2] + #define varAL3(x) AL1 x[3] + #define varAL4(x) AL1 x[4] + #define varAU2(x) AU1 x[2] + #define varAU3(x) AU1 x[3] + #define varAU4(x) AU1 x[4] 
+//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) {x,y} + #define initAD3(x,y,z) {x,y,z} + #define initAD4(x,y,z,w) {x,y,z,w} + #define initAF2(x,y) {x,y} + #define initAF3(x,y,z) {x,y,z} + #define initAF4(x,y,z,w) {x,y,z,w} + #define initAL2(x,y) {x,y} + #define initAL3(x,y,z) {x,y,z} + #define initAL4(x,y,z,w) {x,y,z,w} + #define initAU2(x,y) {x,y} + #define initAU3(x,y,z) {x,y,z} + #define initAU4(x,y,z,w) {x,y,z,w} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Replace transcendentals with manual versions. 
+//============================================================================================================================== + #ifdef A_GCC + A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_labs(ASL1_(a)));} + #else + A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(llabs(ASL1_(a)));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} + #else + A_STATIC AD1 ACosD1(AD1 a){return cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} + A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} + #else + A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);} + #endif 
+//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} + #else + A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} + A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);} + #else + A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;} + A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;} + A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;} + A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + // These follow the convention that A integer types don't have signage, until they are operated on. 
+ A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;} + A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a>ASL1_(b));} + A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);} + #else + A_STATIC AD1 ASinD1(AD1 a){return sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return sinf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);} + #else + A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);} + A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);} 
+//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));} + A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));} + A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} + A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. 
+//============================================================================================================================== + A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;} + A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;} + A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;} + A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;} + A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opACpyD2(outAD2 
d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;} + A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;} + A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;} + A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;} + A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpOneD2(outAD2 
d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;} + A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;} + A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;} + A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;} + A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 
b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 
a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} + A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;} + A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;} +//============================================================================================================================== + A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;} + A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;} + A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;} + A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;} + A_STATIC retAD3 opARcpD3(outAD3 
d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;} + A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;} + A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;} + A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF FLOAT PACKING +//============================================================================================================================== + // Convert float to half (in lower 16-bits of output). + // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + // Supports denormals. 
+ // Conversion rules are to make computations possibly "safer" on the GPU, + // -INF & -NaN -> -65504 + // +INF & +NaN -> +65504 + A_STATIC AU1 AU1_AH1_AF1(AF1 f){ + static AW1 base[512]={ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100, + 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00, + 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 
0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100, + 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00, + 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff}; + static AB1 shift[512]={ + 
0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 
0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18}; + union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} +//------------------------------------------------------------------------------------------------------------------------------ + // Used to output packed constant. + A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ 
+//============================================================================================================================== +// +// +// GLSL +// +// +//============================================================================================================================== +#if defined(A_GLSL) && defined(A_GPU) + #ifndef A_SKIP_EXT + #ifdef A_HALF + #extension GL_EXT_shader_16bit_storage:require + #extension GL_EXT_shader_explicit_arithmetic_types:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_LONG + #extension GL_ARB_gpu_shader_int64:require + // TODO: Fixme to more portable extension!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + #extension GL_NV_shader_atomic_int64:require + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_WAVE + #extension GL_KHR_shader_subgroup_arithmetic:require + #extension GL_KHR_shader_subgroup_ballot:require + #extension GL_KHR_shader_subgroup_quad:require + #extension GL_KHR_shader_subgroup_shuffle:require + #endif + #endif +//============================================================================================================================== + #define AP1 bool + #define AP2 bvec2 + #define AP3 bvec3 + #define AP4 bvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 vec2 + #define AF3 vec3 + #define AF4 vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uvec2 + #define AU3 uvec3 + #define AU4 uvec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int 
+ #define ASU2 ivec2 + #define ASU3 ivec3 + #define ASU4 ivec4 +//============================================================================================================================== + #define AF1_AU1(x) uintBitsToFloat(AU1(x)) + #define AF2_AU2(x) uintBitsToFloat(AU2(x)) + #define AF3_AU3(x) uintBitsToFloat(AU3(x)) + #define AF4_AU4(x) uintBitsToFloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) floatBitsToUint(AF1(x)) + #define AU2_AF2(x) floatBitsToUint(AF2(x)) + #define AU3_AF3(x) floatBitsToUint(AF3(x)) + #define AU4_AF4(x) floatBitsToUint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2_AF2 packHalf2x16 + #define AU1_AW2Unorm_AF2 packUnorm2x16 + #define AU1_AB4Unorm_AF4 packUnorm4x8 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF2_AH2_AU1 unpackHalf2x16 + #define AF2_AW2Unorm_AU1 unpackUnorm2x16 + #define AF4_AB4Unorm_AU1 unpackUnorm4x8 +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) 
AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #define AB1 uint8_t + #define AB2 u8vec2 + #define AB3 u8vec3 + #define AB4 u8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASB1 int8_t + #define ASB2 i8vec2 + #define ASB3 i8vec3 + #define ASB4 i8vec4 +//------------------------------------------------------------------------------------------------------------------------------ + AB1 
AB1_x(AB1 a){return AB1(a);} + AB2 AB2_x(AB1 a){return AB2(a,a);} + AB3 AB3_x(AB1 a){return AB3(a,a,a);} + AB4 AB4_x(AB1 a){return AB4(a,a,a,a);} + #define AB1_(a) AB1_x(AB1(a)) + #define AB2_(a) AB2_x(AB1(a)) + #define AB3_(a) AB3_x(AB1(a)) + #define AB4_(a) AB4_x(AB1(a)) + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #define AH1 float16_t + #define AH2 f16vec2 + #define AH3 f16vec3 + #define AH4 f16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 u16vec2 + #define AW3 u16vec3 + #define AW4 u16vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 i16vec2 + #define ASW3 i16vec3 + #define ASW4 i16vec4 +//============================================================================================================================== + #define AH2_AU1(x) unpackFloat2x16(AU1(x)) + AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));} + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) unpackUint2x16(AU1(x)) + #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x))) 
+//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AH2(x) packFloat2x16(AH2(x)) + AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));} + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) packUint2x16(AW2(x)) + #define AU2_AW4(x) unpack32(packUint4x16(AW4(x))) +//============================================================================================================================== + #define AW1_AH1(x) halfBitsToUint16(AH1(x)) + #define AW2_AH2(x) halfBitsToUint16(AH2(x)) + #define AW3_AH3(x) halfBitsToUint16(AH3(x)) + #define AW4_AH4(x) halfBitsToUint16(AH4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AH1_AW1(x) uint16BitsToHalf(AW1(x)) + #define AH2_AW2(x) uint16BitsToHalf(AW2(x)) + #define AH3_AW3(x) uint16BitsToHalf(AW3(x)) + #define AH4_AW4(x) uint16BitsToHalf(AW4(x)) +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return 
AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AFractH1(AH1 x){return fract(x);} + AH2 AFractH2(AH2 x){return fract(x);} + AH3 AFractH3(AH3 x){return fract(x);} + AH4 AFractH4(AH4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of max3. + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // No packed version of min3. 
+ AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;} + AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;} + AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;} + AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);} + AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);} + AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);} + AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));} + AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));} + AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));} + AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #define AD1 double + #define AD2 dvec2 + #define AD3 dvec3 + #define AD4 dvec4 +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 x){return fract(x);} + AD2 AFractD2(AD2 x){return fract(x);} + AD3 AFractD3(AD3 x){return fract(x);} + AD4 AFractD4(AD4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;} + AD2 ARcpD2(AD2 
x){return AD2_(1.0)/x;} + AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;} + AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);} + AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);} + AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);} + AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));} + AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));} + AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));} + AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL LONG +//============================================================================================================================== + #ifdef A_LONG + #define AL1 uint64_t + #define AL2 u64vec2 + #define AL3 u64vec3 + #define AL4 u64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1 int64_t + #define ASL2 i64vec2 + #define ASL3 i64vec3 + #define ASL4 i64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AL1_AU2(x) packUint2x32(AU2(x)) + #define AU2_AL1(x) 
unpackUint2x32(AL1(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AL1_x(AL1 a){return AL1(a);} + AL2 AL2_x(AL1 a){return AL2(a,a);} + AL3 AL3_x(AL1 a){return AL3(a,a,a);} + AL4 AL4_x(AL1 a){return AL4(a,a,a,a);} + #define AL1_(a) AL1_x(AL1(a)) + #define AL2_(a) AL2_x(AL1(a)) + #define AL3_(a) AL3_x(AL1(a)) + #define AL4_(a) AL4_x(AL1(a)) +//============================================================================================================================== + AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));} + AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));} + AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));} + AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASU1(a),ASU1(b)));} + AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASU2(a),ASU2(b)));} + AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASU3(a),ASU3(b)));} + AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASU1(a),ASU1(b)));} + AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASU2(a),ASU2(b)));} + AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASU3(a),ASU3(b)));} + AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASU4(a),ASU4(b)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ 
+//============================================================================================================================== +// WAVE OPERATIONS +//============================================================================================================================== + #ifdef A_WAVE + AF1 AWaveAdd(AF1 v){return subgroupAdd(v);} + AF2 AWaveAdd(AF2 v){return subgroupAdd(v);} + AF3 AWaveAdd(AF3 v){return subgroupAdd(v);} + AF4 AWaveAdd(AF4 v){return subgroupAdd(v);} + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// HLSL +// +// +//============================================================================================================================== +#if defined(A_HLSL) && defined(A_GPU) + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 
+//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 float2 + #define AF3 float3 + #define AF4 float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uint2 + #define AU3 uint3 + #define AU4 uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 int2 + #define ASU3 int3 + #define ASU4 int4 +//============================================================================================================================== + #define AF1_AU1(x) asfloat(AU1(x)) + #define AF2_AU2(x) asfloat(AU2(x)) + #define AF3_AU3(x) asfloat(AU3(x)) + #define AF4_AU4(x) asfloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) asuint(AF1(x)) + #define AU2_AF2(x) asuint(AF2(x)) + #define AU3_AF3(x) asuint(AF3(x)) + #define AU4_AF4(x) asuint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} + #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) + #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} + #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x)) +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 
a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1<>off)&mask;} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1<>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL BYTE 
+//============================================================================================================================== + #ifdef A_BYTE + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #define AH1 min16float + #define AH2 min16float2 + #define AH3 min16float3 + #define AH4 min16float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 min16uint + #define AW2 min16uint2 + #define AW3 min16uint3 + #define AW4 min16uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 min16int + #define ASW2 min16int2 + #define ASW3 min16int3 + #define ASW4 min16int4 +//============================================================================================================================== + // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). 
+ // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ + AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} + AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} + AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} + AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));} + #define AH2_AU1(x) AH2_AU1_x(AU1(x)) + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) AW2_AU1_x(AU1(x)) + #define AW4_AU2(x) AW4_AU2_x(AU2(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} + AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));} + AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} + AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));} + #define AU1_AH2(x) AU1_AH2_x(AH2(x)) + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) AU1_AW2_x(AW2(x)) + #define AU2_AW4(x) AU2_AW4_x(AW4(x)) +//============================================================================================================================== + // TODO: These are broken!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + #define AW1_AH1(x) AW1(asuint(AF1(x))) + #define AW2_AH2(x) AW2(asuint(AF2(x))) + #define AW3_AH3(x) AW3(asuint(AF3(x))) + #define AW4_AH4(x) AW4(asuint(AF4(x))) +//------------------------------------------------------------------------------------------------------------------------------ + // TODO: These are broken!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ #define AH1_AW1(x) AH1(asfloat(AU1(x))) + #define AH2_AW2(x) AH2(asfloat(AU2(x))) + #define AH3_AW3(x) AH3(asfloat(AU3(x))) + #define AH4_AW4(x) AH4(asfloat(AU4(x))) +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + // V_FRACT_F16 (note DX frac() is different). 
+ AH1 AFractH1(AH1 x){return x-floor(x);} + AH2 AFractH2(AH2 x){return x-floor(x);} + AH3 AFractH3(AH3 x){return x-floor(x);} + AH4 AFractH4(AH4 x){return x-floor(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASU1(a),ASU1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASU2(a),ASU2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASU3(a),ASU3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASU1(a),ASU1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASU2(a),ASU2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASU3(a),ASU3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASU4(a),ASU4(b)));} 
+//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARcpH1(AH1 x){return rcp(x);} + AH2 ARcpH2(AH2 x){return rcp(x);} + AH3 ARcpH3(AH3 x){return rcp(x);} + AH4 ARcpH4(AH4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return rsqrt(x);} + AH2 ARsqH2(AH2 x){return rsqrt(x);} + AH3 ARsqH3(AH3 x){return rsqrt(x);} + AH4 ARsqH4(AH4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return saturate(x);} + AH2 ASatH2(AH2 x){return saturate(x);} + AH3 ASatH3(AH3 x){return saturate(x);} + AH4 ASatH4(AH4 x){return saturate(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #define AD1 double + #define AD2 double2 + #define AD3 double3 + #define AD4 double4 
+//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 a){return a-floor(a);} + AD2 AFractD2(AD2 a){return a-floor(a);} + AD3 AFractD3(AD3 a){return a-floor(a);} + AD4 AFractD4(AD4 a){return a-floor(a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return rcp(x);} + AD2 ARcpD2(AD2 x){return rcp(x);} + AD3 ARcpD3(AD3 x){return rcp(x);} + AD4 ARcpD4(AD4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return rsqrt(x);} + AD2 ARsqD2(AD2 x){return rsqrt(x);} + AD3 ARsqD3(AD3 x){return rsqrt(x);} + AD4 ARsqD4(AD4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return saturate(x);} + AD2 ASatD2(AD2 x){return saturate(x);} + AD3 ASatD3(AD3 x){return saturate(x);} + AD4 ASatD4(AD4 x){return saturate(x);} + #endif 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL LONG +//============================================================================================================================== + #ifdef A_LONG + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU COMMON +// +// 
+//============================================================================================================================== +#ifdef A_GPU + // Negative and positive infinity. + #define A_INFN_F AF1_AU1(0x7f800000u) + #define A_INFP_F AF1_AU1(0xff800000u) +//------------------------------------------------------------------------------------------------------------------------------ + // Copy sign from 's' to positive 'd'. + AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));} + AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));} + AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));} + AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Single operation to return (useful to create a mask to use in lerp for branch free logic), + // m=NaN := 0 + // m>=0 := 0 + // m<0 := 1 + // Uses the following useful floating point logic, + // saturate(+a*(-INF)==-INF) := 0 + // saturate( 0*(-INF)== NaN) := 0 + // saturate(-a*(-INF)==+INF) := 1 + AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));} + AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));} + AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));} + AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));} +//============================================================================================================================== + #ifdef A_HALF + #define A_INFN_H AH1_AW1(0x7c00u) + #define A_INFP_H AH1_AW1(0xfc00u) +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));} + AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));} + AH3 ACpySgnH3(AH3 d,AH3 
s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));} + AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));} + AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));} + AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));} + AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HALF APPROXIMATIONS +//------------------------------------------------------------------------------------------------------------------------------ +// These support only positive inputs. +// Did not see value yet in specialization for range. +// Using quick testing, ended up mostly getting the same "best" approximation for various ranges. +// With hardware that can co-execute transcendentals, the value in approximations could be less than expected. +// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total. +// And co-execution would require a compiler interleaving a lot of independent work for packed usage. +//------------------------------------------------------------------------------------------------------------------------------ +// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total). +// Same with sqrt(), as this could be x*rsq() (7 ops). 
//------------------------------------------------------------------------------------------------------------------------------
// IDEAS
// =====
//  - Polaris hardware has 16-bit support, but non-double rate.
//    Could be possible still get part double rate for some of this logic,
//    by clearing out the lower half's sign when necessary and using 32-bit ops...
//==============================================================================================================================
 #ifdef A_HALF
  // Half-precision square-root estimate, 2 ops.
  // Squared error is minimized over the full positive range; with the 0x1de2 bias, inputs in {0 to 1} map below 1.0.
  AH1 APrxLoSqrtH1(AH1 x){return AH1_AW1((AW1_AH1(x)>>AW1_(1))+AW1_(0x1de2));}
  AH2 APrxLoSqrtH2(AH2 x){return AH2_AW2((AW2_AH2(x)>>AW2_(1))+AW2_(0x1de2));}
//------------------------------------------------------------------------------------------------------------------------------
  // Cheapest reciprocal estimate, a single op.
  // Squared error is minimized over {smallest normal to 16384.0}.
  AH1 APrxLoRcpH1(AH1 x){return AH1_AW1(AW1_(0x7784)-AW1_AH1(x));}
  AH2 APrxLoRcpH2(AH2 x){return AH2_AW2(AW2_(0x7784)-AW2_AH2(x));}
//------------------------------------------------------------------------------------------------------------------------------
  // Reciprocal estimate refined with one Newton-Raphson step, 3 ops.
  AH1 APrxMedRcpH1(AH1 x){AH1 e=AH1_AW1(AW1_(0x778d)-AW1_AH1(x));return e*(-e*x+AH1_(2.0));}
  AH2 APrxMedRcpH2(AH2 x){AH2 e=AH2_AW2(AW2_(0x778d)-AW2_AH2(x));return e*(-e*x+AH2_(2.0));}
//------------------------------------------------------------------------------------------------------------------------------
  // Reciprocal square-root estimate, 2 ops; squared error minimized over {smallest normal to 16384.0}.
  AH1 APrxLoRsqH1(AH1 x){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(x)>>AW1_(1)));}
  AH2 APrxLoRsqH2(AH2 x){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(x)>>AW2_(1)));}
 #endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                    FLOAT APPROXIMATIONS
//------------------------------------------------------------------------------------------------------------------------------
// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
//  - Idea dates back to SGI, then to Quake 3, etc.
//  - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
//     - sqrt(x)=rsqrt(x)*x
//     - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
//  - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
//------------------------------------------------------------------------------------------------------------------------------
// These below are from perhaps less complete searching for optimal.
// Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
// So these match up well with the half approximations.
+//============================================================================================================================== + AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));} + AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));} + AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));} + AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PARABOLIC SIN & COS +//------------------------------------------------------------------------------------------------------------------------------ +// Approximate answers to transcendental questions. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Verify packed math ABS is correctly doing an AND. +//============================================================================================================================== + // Valid input range is {-1 to 1} representing {0 to 2 pi}. + // Output range is {-1/4 to -1/4} representing {-1 to 1}. + AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD. 
 AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT
//------------------------------------------------------------------------------------------------------------------------------
 #ifdef A_HALF
  // For a packed {sin,cos} pair,
  //  - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
  //  - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
  AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA
  AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND
 #endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                     COLOR CONVERSIONS
//------------------------------------------------------------------------------------------------------------------------------
// These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
// These are branch free implementations.
// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
//------------------------------------------------------------------------------------------------------------------------------
// TRANSFER FUNCTIONS
// ==================
// 709 ..... Rec709 used for some HDTVs
// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
// Pq ...... PQ native for HDR10
// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
//------------------------------------------------------------------------------------------------------------------------------
// FOR PQ
// ======
// Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2.
// All constants are only specified to FP32 precision.
// External PQ source reference,
//  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
//------------------------------------------------------------------------------------------------------------------------------
// PACKED VERSIONS
// ===============
// These are the A*H2() functions.
// There is no PQ functions as FP16 seemed to not have enough precision for the conversion.
// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
//------------------------------------------------------------------------------------------------------------------------------
// NOTES
// =====
// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
//==============================================================================================================================
 AF1 ATo709F1(AF1 c){return max(min(c*AF1_(4.5),AF1_(0.018)),AF1_(1.099)*pow(c,AF1_(0.45))-AF1_(0.099));}
//------------------------------------------------------------------------------------------------------------------------------
 // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
 AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,rcpX);}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
  return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AToSrgbF1(AF1 c){return max(min(c*AF1_(12.92),AF1_(0.0031308)),AF1_(1.055)*pow(c,AF1_(0.41666))-AF1_(0.055));}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AToTwoF1(AF1 c){return sqrt(c);}
//==============================================================================================================================
 AF1 AFrom709F1(AF1 c){return max(min(c*AF1_(1.0/4.5),AF1_(0.081)),
  pow((c+AF1_(0.099))*(AF1_(1.0)/(AF1_(1.099))),AF1_(1.0/0.45)));}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,x);}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
  return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AFromSrgbF1(AF1 c){return max(min(c*AF1_(1.0/12.92),AF1_(0.04045)),
  pow((c+AF1_(0.055))*(AF1_(1.0)/AF1_(1.055)),AF1_(2.4)));}
//------------------------------------------------------------------------------------------------------------------------------
 AF1 AFromTwoF1(AF1 c){return c*c;}
//==============================================================================================================================
 // Packed FP16 variants; same transfer curves as the AF1 versions above (no PQ variant, see PACKED VERSIONS note).
 #ifdef A_HALF
  AH2 ATo709H2(AH2 c){return max(min(c*AH2_(4.5),AH2_(0.018)),AH2_(1.099)*pow(c,AH2_(0.45))-AH2_(0.099));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AToSrgbH2(AH2 c){return max(min(c*AH2_(12.92),AH2_(0.0031308)),AH2_(1.055)*pow(c,AH2_(0.41666))-AH2_(0.055));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AToTwoH2(AH2 c){return sqrt(c);}
 #endif
//==============================================================================================================================
 #ifdef A_HALF
  AH2 AFrom709H2(AH2 c){return max(min(c*AH2_(1.0/4.5),AH2_(0.081)),
   pow((c+AH2_(0.099))*(AH2_(1.0)/(AH2_(1.099))),AH2_(1.0/0.45)));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AFromSrgbH2(AH2 c){return max(min(c*AH2_(1.0/12.92),AH2_(0.04045)),
   pow((c+AH2_(0.055))*(AH2_(1.0)/AH2_(1.055)),AH2_(2.4)));}
//------------------------------------------------------------------------------------------------------------------------------
  AH2 AFromTwoH2(AH2 c){return c*c;}
 #endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CS REMAP +//============================================================================================================================== + // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear. + // 543210 + // ====== + // ..xxx. + // yy...y + AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} +//============================================================================================================================== + // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions. + // 543210 + // ====== + // .xx..x + // y..yy. + // Details, + // LANE TO 8x8 MAPPING + // =================== + // 00 01 08 09 10 11 18 19 + // 02 03 0a 0b 12 13 1a 1b + // 04 05 0c 0d 14 15 1c 1d + // 06 07 0e 0f 16 17 1e 1f + // 20 21 28 29 30 31 38 39 + // 22 23 2a 2b 32 33 3a 3b + // 24 25 2c 2d 34 35 3c 3d + // 26 27 2e 2f 36 37 3e 3f + AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// REFERENCE +// +//------------------------------------------------------------------------------------------------------------------------------ +// IEEE FLOAT RULES +// ================ +// - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1 +// - {+/-}0 * {+/-}INF = NaN +// - -INF + (+INF) = NaN +// - {+/-}0 / {+/-}0 = NaN +// - {+/-}INF / {+/-}INF = NaN +// - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN) +// - 0 == -0 +// - 4/0 = +INF +// - 4/-0 = -INF +// - 4+INF = +INF +// - 4-INF = -INF +// - 4*(+INF) = +INF +// - 4*(-INF) = -INF +// - -4*(+INF) = -INF +// - sqrt(+INF) = +INF +//------------------------------------------------------------------------------------------------------------------------------ +// FP16 ENCODING +// ============= +// fedcba9876543210 +// ---------------- +// ......mmmmmmmmmm 10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals) +// .eeeee.......... 5-bit exponent +// .00000.......... denormals +// .00001.......... -14 exponent +// .11110.......... 15 exponent +// .111110000000000 infinity +// .11111nnnnnnnnnn NaN with n!=0 +// s............... sign +//------------------------------------------------------------------------------------------------------------------------------ +// FP16/INT16 ALIASING DENORMAL +// ============================ +// 11-bit unsigned integers alias with half float denormal/normal values, +// 1 = 2^(-24) = 1/16777216 ....................... first denormal value +// 2 = 2^(-23) +// ... +// 1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value +// 1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers +// 2047 .............................................. 
last normal value that still maps to integers +// Scaling limits, +// 2^15 = 32768 ...................................... largest power of 2 scaling +// Largest pow2 conversion mapping is at *32768, +// 1 : 2^(-9) = 1/128 +// 1024 : 8 +// 2047 : a little less than 16 +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU/CPU PORTABILITY +// +// +//------------------------------------------------------------------------------------------------------------------------------ +// This is the GPU implementation. +// See the CPU implementation for docs. 
//==============================================================================================================================
#ifdef A_GPU
 // Boolean and storage-class tokens for the GPU side (the CPU implementation maps these differently).
 #define A_TRUE true
 #define A_FALSE false
 #define A_STATIC
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                      VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
//==============================================================================================================================
 // 'ret*' -- vector return-type tokens.
 #define retAD2 AD2
 #define retAD3 AD3
 #define retAD4 AD4
 #define retAF2 AF2
 #define retAF3 AF3
 #define retAF4 AF4
 #define retAL2 AL2
 #define retAL3 AL3
 #define retAL4 AL4
 #define retAU2 AU2
 #define retAU3 AU3
 #define retAU4 AU4
//------------------------------------------------------------------------------------------------------------------------------
 // 'in*' -- input-parameter tokens.
 #define inAD2 in AD2
 #define inAD3 in AD3
 #define inAD4 in AD4
 #define inAF2 in AF2
 #define inAF3 in AF3
 #define inAF4 in AF4
 #define inAL2 in AL2
 #define inAL3 in AL3
 #define inAL4 in AL4
 #define inAU2 in AU2
 #define inAU3 in AU3
 #define inAU4 in AU4
//------------------------------------------------------------------------------------------------------------------------------
 // 'inout*' -- read/write parameter tokens.
 #define inoutAD2 inout AD2
 #define inoutAD3 inout AD3
 #define inoutAD4 inout AD4
 #define inoutAF2 inout AF2
 #define inoutAF3 inout AF3
 #define inoutAF4 inout AF4
 #define inoutAL2 inout AL2
 #define inoutAL3 inout AL3
 #define inoutAL4 inout AL4
 #define inoutAU2 inout AU2
 #define inoutAU3 inout AU3
 #define inoutAU4 inout AU4
//------------------------------------------------------------------------------------------------------------------------------
 // 'out*' -- write-only parameter tokens.
 #define outAD2 out AD2
 #define outAD3 out AD3
 #define outAD4 out AD4
 #define outAF2 out AF2
 #define outAF3 out AF3
 #define outAF4 out AF4
 #define outAL2 out AL2
 #define outAL3 out AL3
 #define outAL4 out AL4
 #define outAU2 out AU2
 #define outAU3 out AU3
 #define outAU4 out AU4
//------------------------------------------------------------------------------------------------------------------------------
 // 'var*(x)' -- local variable declaration tokens.
 #define varAD2(x) AD2 x
 #define varAD3(x) AD3 x
 #define varAD4(x) AD4 x
 #define varAF2(x) AF2 x
 #define varAF3(x) AF3 x
 #define varAF4(x) AF4 x
 #define varAL2(x) AL2 x
 #define varAL3(x) AL3 x
 #define varAL4(x) AL4 x
 #define varAU2(x) AU2 x
 #define varAU3(x) AU3 x
 #define varAU4(x) AU4 x
//------------------------------------------------------------------------------------------------------------------------------
 // 'init*(...)' -- vector constructor/initializer tokens.
 #define initAD2(x,y) AD2(x,y)
 #define initAD3(x,y,z) AD3(x,y,z)
 #define initAD4(x,y,z,w) AD4(x,y,z,w)
 #define initAF2(x,y) AF2(x,y)
 #define initAF3(x,y,z) AF3(x,y,z)
 #define initAF4(x,y,z,w) AF4(x,y,z,w)
 #define initAL2(x,y) AL2(x,y)
 #define initAL3(x,y,z) AL3(x,y,z)
 #define initAL4(x,y,z,w) AL4(x,y,z,w)
 #define initAU2(x,y) AU2(x,y)
 #define initAU3(x,y,z) AU3(x,y,z)
 #define initAU4(x,y,z,w) AU4(x,y,z,w)
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                      SCALAR RETURN OPS
//==============================================================================================================================
 #define AAbsD1(a) abs(AD1(a))
 #define AAbsF1(a) abs(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ACosD1(a) cos(AD1(a))
 #define ACosF1(a) cos(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ADotD2(a,b) dot(AD2(a),AD2(b))
 #define ADotD3(a,b) dot(AD3(a),AD3(b))
 #define ADotD4(a,b) dot(AD4(a),AD4(b))
 #define ADotF2(a,b) dot(AF2(a),AF2(b))
 #define ADotF3(a,b) dot(AF3(a),AF3(b))
 #define ADotF4(a,b) dot(AF4(a),AF4(b))
//------------------------------------------------------------------------------------------------------------------------------
 #define AExp2D1(a) exp2(AD1(a))
 #define AExp2F1(a) exp2(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define AFloorD1(a) floor(AD1(a))
 #define AFloorF1(a) floor(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ALog2D1(a) log2(AD1(a))
 #define ALog2F1(a) log2(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 // BUGFIX(review): the AMax* macros expanded to min(a,b); restored to max(a,b).
 #define AMaxD1(a,b) max(a,b)
 #define AMaxF1(a,b) max(a,b)
 #define AMaxL1(a,b) max(a,b)
 #define AMaxU1(a,b) max(a,b)
//------------------------------------------------------------------------------------------------------------------------------
 #define AMinD1(a,b) min(a,b)
 #define AMinF1(a,b) min(a,b)
 #define AMinL1(a,b) min(a,b)
 #define AMinU1(a,b) min(a,b)
//------------------------------------------------------------------------------------------------------------------------------
 #define ASinD1(a) sin(AD1(a))
 #define ASinF1(a) sin(AF1(a))
//------------------------------------------------------------------------------------------------------------------------------
 #define ASqrtD1(a) sqrt(AD1(a))
 #define ASqrtF1(a) sqrt(AF1(a))
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                SCALAR RETURN OPS - DEPENDENT
//==============================================================================================================================
 #define APowD1(a,b) pow(AD1(a),AF1(b))
 #define APowF1(a,b) pow(AF1(a),AF1(b))
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//_____________________________________________________________/\_______________________________________________________________
//==============================================================================================================================
//                                                         VECTOR OPS
//------------------------------------------------------------------------------------------------------------------------------
// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + #ifdef A_DUBL + AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;} + AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;} + AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;} + AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;} + AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;} + AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;} + AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;} + AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;} + AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;} + AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;} + AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;} + AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 
b){d=max(a,b);return d;} + AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;} + AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;} + AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;} + AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;} + AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;} + AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;} + AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;} + AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;} + AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;} + AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;} + AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;} + #endif +//============================================================================================================================== + AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;} + AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;} + AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;} 
+//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;} + AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;} + AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;} + AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;} + AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;} + AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;} + AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;} + AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;} + AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;} + AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;} + AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;} + AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;} + AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;} 
+//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;} + AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;} + AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;} + AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;} + AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;} + AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;} + AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;} + AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;} + AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;} +#endif \ No newline at end of file diff --git a/bin/data/shaders/ext/ffx_spd.h b/bin/data/shaders/ext/ffx_spd.h new file mode 100644 index 00000000..dc167261 --- /dev/null +++ b/bin/data/shaders/ext/ffx_spd.h @@ -0,0 +1,1297 @@ +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// [FFX SPD] Single Pass Downsampler 2.0 +// +//============================================================================================================================== +// LICENSE +// ======= +// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved. 
+// ------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGELIST v2.0 +// =============== +// - Added support for cube and array textures. SpdDownsample and SpdDownsampleH shader functions now take index of texture slice +// as an additional parameter. For regular texture use 0. +// - Added support for updating only sub-rectangle of the texture. Additional, optional parameter workGroupOffset added to shader +// functions SpdDownsample and SpdDownsampleH. +// - Added C function SpdSetup that helps to setup constants to be passed as a constant buffer. 
+// - The global atomic counter is automatically reset to 0 by the shader at the end, so you do not need to clear it before every +// use, just once after creation +// +//------------------------------------------------------------------------------------------------------------------------------ +// INTEGRATION SUMMARY FOR CPU +// =========================== +// // you need to provide as constants: +// // number of mip levels to be computed (maximum is 12) +// // number of total thread groups: ((widthInPixels+63)>>6) * ((heightInPixels+63)>>6) +// // workGroupOffset -> by default 0, if you only downsample a rectancle within the source texture use SpdSetup function to calculate correct offset +// ... +// // Dispatch the shader such that each thread group works on a 64x64 sub-tile of the source image +// // for Cube Textures or Texture2DArray, use the z dimension +// vkCmdDispatch(cmdBuf,(widthInPixels+63)>>6,(heightInPixels+63)>>6, slices); + +// // you can also use the SpdSetup function: +// //on top of your cpp file: +// #define A_CPU +// #include "ffx_a.h" +// #include "ffx_spd.h" +// // before your dispatch call, use SpdSetup function to get your constants +// varAU2(dispatchThreadGroupCountXY); // output variable +// varAU2(workGroupOffset); // output variable, this constants are required if Left and Top are not 0,0 +// varAU2(numWorkGroupsAndMips); // output variable +// // input information about your source texture: +// // left and top of the rectancle within your texture you want to downsample +// // width and height of the rectancle you want to downsample +// // if complete source texture should get downsampled: left = 0, top = 0, width = sourceTexture.width, height = sourceTexture.height +// varAU4(rectInfo) = initAU4(0, 0, m_Texture.GetWidth(), m_Texture.GetHeight()); // left, top, width, height +// SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo); +// ... 
+// // constants: +// data.numWorkGroupsPerSlice = numWorkGroupsAndMips[0]; +// data.mips = numWorkGroupsAndMips[1]; +// data.workGroupOffset[0] = workGroupOffset[0]; +// data.workGroupOffset[1] = workGroupOffset[1]; +// ... +// uint32_t dispatchX = dispatchThreadGroupCountXY[0]; +// uint32_t dispatchY = dispatchThreadGroupCountXY[1]; +// uint32_t dispatchZ = m_CubeTexture.GetArraySize(); // slices - for 2D Texture this is 1, for cube texture 6 +// vkCmdDispatch(cmd_buf, dispatchX, dispatchY, dispatchZ); + +//------------------------------------------------------------------------------------------------------------------------------ +// INTEGRATION SUMMARY FOR GPU +// =========================== + +// [SAMPLER] - if you want to use a sampler with linear filtering for loading the source image +// follow additionally the instructions marked with [SAMPLER] +// add following define: +// #define SPD_LINEAR_SAMPLER +// this is recommended, as using one sample() with linear filter to reduce 2x2 is faster +// than 4x load() plus manual averaging + +// // Setup layout. Example below for VK_FORMAT_R16G16B16A16_SFLOAT. 
+// // Note: If you use SRGB format for UAV load() and store() (if it's supported), you need to convert to and from linear space +// // when using UAV load() and store() +// // approximate conversion to linear (load function): x*x +// // approximate conversion from linear (store function): sqrt() +// // or use more accurate functions from ffx_a.h: AFromSrgbF1(value) and AToSrgbF1(value) +// // Recommendation: use UNORM format instead of SRGB for UAV access, and SRGB for SRV access +// // look in the sample app to see how it's done + +// // source image +// // if cube texture use image2DArray / Texture2DArray and adapt your load/store/sample calls +// GLSL: layout(set=0,binding=0,rgba16f)uniform image2D imgSrc; +// [SAMPLER]: layout(set=0,binding=0)uniform texture2D imgSrc; +// HLSL: [[vk::binding(0)]] Texture2D imgSrc :register(u0); + +// // destination -> 12 is the maximum number of mips supported by SPD +// GLSL: layout(set=0,binding=1,rgba16f) uniform coherent image2D imgDst[12]; +// HLSL: [[vk::binding(1)]] globallycoherent RWTexture2D imgDst[12] :register(u1); + +// // global atomic counter - MUST be initialized to 0 +// // SPD resets the counter back after each run by calling SpdResetAtomicCounter(slice) +// // if you have more than 1 slice (== if you downsample a cube texture or a texture2Darray) +// // you have an array of counters: counter[6] -> if you have 6 slices for example +// // GLSL: +// layout(std430, set=0, binding=2) coherent buffer SpdGlobalAtomicBuffer +// { +// uint counter; +// } spdGlobalAtomic; +// // HLSL: +// struct SpdGlobalAtomicBuffer +// { +// uint counter; +// }; +// [[vk::binding(2)]] globallycoherent RWStructuredBuffer spdGlobalAtomic; + +// // [SAMPLER] add sampler +// GLSL: layout(set=0, binding=3) uniform sampler srcSampler; +// HLSL: [[vk::binding(3)]] SamplerState srcSampler :register(s0); + +// // constants - either push constant or constant buffer +// // or calculate within shader +// // [SAMPLER] when using sampler add 
inverse source image size +// // GLSL: +// layout(push_constant) uniform SpdConstants { +// uint mips; // needed to opt out earlier if mips are < 12 +// uint numWorkGroups; // number of total thread groups, so numWorkGroupsX * numWorkGroupsY * 1 +// // it is important to NOT take the number of slices (z dimension) into account here +// // as each slice has its own counter! +// vec2 workGroupOffset; // optional - use SpdSetup() function to calculate correct workgroup offset +// } spdConstants; +// // HLSL: +// [[vk::push_constant]] +// cbuffer spdConstants { +// uint mips; +// uint numWorkGroups; +// float2 workGroupOffset; // optional +// }; + +// ... +// // Setup pre-portability-header defines (sets up GLSL/HLSL path, etc) +// #define A_GPU 1 +// #define A_GLSL 1 // or // #define A_HLSL 1 + +// // if you want to use PACKED version +// // recommended if bpc <= 16bit +// #define A_HALF + +// ... +// // Include the portability header (or copy it in without an include). +// #include "ffx_a.h" +// ... 
+ +// // Define LDS variables +// shared AF4 spdIntermediate[16][16]; // HLSL: groupshared +// shared AU1 spdCounter; // HLSL: groupshared +// // PACKED version +// shared AH4 spdIntermediate[16][16]; // HLSL: groupshared +// // Note: You can also use +// shared AF1 spdIntermediateR[16][16]; +// shared AF1 spdIntermediateG[16][16]; +// shared AF1 spdIntermediateB[16][16]; +// shared AF1 spdIntermediateA[16][16]; +// // or for Packed version: +// shared AH2 spdIntermediateRG[16][16]; +// shared AH2 spdIntermediateBA[16][16]; +// // This is potentially faster +// // Adapt your load and store functions accordingly + +// // if subgroup operations are not supported / can't use SM6.0 +// #define SPD_NO_WAVE_OPERATIONS + +// // Define the fetch function(s) and the reduction function +// // if non-power-of-2 textures, add border controls to the load and store functions +// // to make sure the borders of the mip level look as you want it +// // if you don't add border controls you'll read zeros past the border +// // if you load with a sampler, this is obv. handled by your sampler :) +// // this is also the place where you need to do color space transformation if needed +// // E.g. 
if your texture format is SRGB/UNORM and you use the UAV load and store functions +// // no automatic to/from linear conversions are happening +// // there is to/from linear conversions when using a sampler and render target approach +// // conversion to linear (load function): x*x +// // conversion from linear (store function): sqrt() + +// AU1 slice parameter is for Cube textures and texture2DArray +// if downsampling Texture2D you can ignore this parameter, otherwise use it to access correct slice +// // Load from source image +// GLSL: AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){return imageLoad(imgSrc, p);} +// HLSL: AF4 SpdLoadSourceImage(ASU2 tex, AU1 slice){return imgSrc[tex];} +// [SAMPLER] don't forget to add the define #SPD_LINEAR_SAMPLER :) +// GLSL: +// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return texture(sampler2D(imgSrc, srcSampler), textureCoord); +// } +// HLSL: +// AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return imgSrc.SampleLevel(srcSampler, textureCoord, 0); +// } + +// // SpdLoad() takes a 32-bit signed integer 2D coordinate and loads color. 
+// // Loads the 5th mip level, each value is computed by a different thread group +// // last thread group will access all its elements and compute the subsequent mips +// // reminder: if non-power-of-2 textures, add border controls if you do not want to read zeros past the border +// GLSL: AF4 SpdLoad(ASU2 p, AU1 slice){return imageLoad(imgDst[5],p);} +// HLSL: AF4 SpdLoad(ASU2 tex, AU1 slice){return imgDst[5][tex];} + +// Define the store function +// GLSL: void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], p, value);} +// HLSL: void SpdStore(ASU2 pix, AF4 value, AU1 mip, AU1 slice){imgDst[mip][pix] = value;} + +// // Define the atomic counter increase function +// // each slice only reads and stores to its specific slice counter +// // so, if you have several slices it's +// // InterlockedAdd(spdGlobalAtomic[0].counter[slice], 1, spdCounter); +// // GLSL: +// void SpdIncreaseAtomicCounter(AU1 slice){spdCounter = atomicAdd(spdGlobalAtomic.counter, 1);} +// AU1 SpdGetAtomicCounter() {return spdCounter;} +// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic.counter[slice] = 0;} +// // HLSL: +// void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter, 1, spdCounter);} +// AU1 SpdGetAtomicCounter(){return spdCounter;} +// void SpdResetAtomicCounter(AU1 slice){spdGlobalAtomic[0].counter[slice] = 0;} + +// // Define the LDS load and store functions +// // GLSL: +// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];} +// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spdIntermediate[x][y] = value;} +// // HLSL: +// AF4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];} +// void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){spdIntermediate[x][y] = value;} + +// // Define your reduction function: takes as input the four 2x2 values and returns 1 output value +// Example below: computes the average value +// AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return (v0+v1+v2+v3)*0.25;} 
+ +// // PACKED VERSION +// Load from source image +// GLSL: AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){return AH4(imageLoad(imgSrc, p));} +// HLSL: AH4 SpdLoadSourceImageH(ASU2 tex, AU1 slice){return AH4(imgSrc[tex]);} +// [SAMPLER] +// GLSL: +// AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return AH4(texture(sampler2D(imgSrc, srcSampler), textureCoord)); +// } +// HLSL: +// AH4 SpdLoadSourceImageH(ASU2 p, AU1 slice){ +// AF2 textureCoord = p * invInputSize + invInputSize; +// return AH4(imgSrc.SampleLevel(srcSampler, textureCoord, 0)); +// } + +// // SpdLoadH() takes a 32-bit signed integer 2D coordinate and loads color. +// // Loads the 5th mip level, each value is computed by a different thread group +// // last thread group will access all its elements and compute the subsequent mips +// GLSL: AH4 SpdLoadH(ASU2 p, AU1 slice){return AH4(imageLoad(imgDst[5],p));} +// HLSL: AH4 SpdLoadH(ASU2 tex, AU1 slice){return AH4(imgDst[5][tex]);} + +// Define the store function +// GLSL: void SpdStoreH(ASU2 p, AH4 value, AU1 mip, AU1 slice){imageStore(imgDst[mip], p, AF4(value));} +// HLSL: void SpdStoreH(ASU2 pix, AH4 value, AU1 index, AU1 slice){imgDst[index][pix] = AF4(value);} + +// // Define the atomic counter increase function +// // GLSL: +// void SpdIncreaseAtomicCounter(AU1 slice){spd_counter = atomicAdd(spdGlobalAtomic.counter, 1);} +// AU1 SpdGetAtomicCounter() {return spdCounter;} +// // HLSL: +// void SpdIncreaseAtomicCounter(AU1 slice){InterlockedAdd(spdGlobalAtomic[0].counter, 1, spdCounter);} +// AU1 SpdGetAtomicCounter(){return spdCounter;} + +// // Define the LDS load and store functions +// // GLSL: +// AH4 SpdLoadIntermediateH(AU1 x, AU1 y){return spdIntermediate[x][y];} +// void SpdStoreIntermediateH(AU1 x, AU1 y, AH4 value){spdIntermediate[x][y] = value;} +// // HLSL: +// AH4 SpdLoadIntermediate(AU1 x, AU1 y){return spdIntermediate[x][y];} +// void SpdStoreIntermediate(AU1 x, AU1 y, AH4 
value){spdIntermediate[x][y] = value;} + +// // Define your reduction function: takes as input the four 2x2 values and returns 1 output value +// Example below: computes the average value +// AH4 SpdReduce4H(AH4 v0, AH4 v1, AH4 v2, AH4 v3){return (v0+v1+v2+v3)*AH1(0.25);} + +// // + +// // If you only use PACKED version +// #define SPD_PACKED_ONLY + +// // Include this SPD (single pass downsampler) header file (or copy it in without an include). +// #include "ffx_spd.h" +// ... + +// // Example in shader integration +// // GLSL: +// layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; +// void main(){ +// // Call the downsampling function +// // WorkGroupId.z should be 0 if you only downsample a Texture2D! +// SpdDownsample(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex), +// AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups), AU1(WorkGroupId.z)); +// +// // PACKED: +// SpdDownsampleH(AU2(gl_WorkGroupID.xy), AU1(gl_LocalInvocationIndex), +// AU1(spdConstants.mips), AU1(spdConstants.numWorkGroups), AU1(WorkGroupId.z)); +// ... +// // HLSL: +// [numthreads(256,1,1)] +// void main(uint3 WorkGroupId : SV_GroupID, uint LocalThreadIndex : SV_GroupIndex) { +// SpdDownsample(AU2(WorkGroupId.xy), AU1(LocalThreadIndex), +// AU1(mips), AU1(numWorkGroups), AU1(WorkGroupId.z)); +// +// // PACKED: +// SpdDownsampleH(AU2(WorkGroupId.xy), AU1(LocalThreadIndex), +// AU1(mips), AU1(numWorkGroups), AU1(WorkGroupId.z)); +// ... 
+ +// +//------------------------------------------------------------------------------------------------------------------------------ + +//============================================================================================================================== +// SPD Setup +//============================================================================================================================== +#ifdef A_CPU +A_STATIC void SpdSetup( +outAU2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy +outAU2 workGroupOffset, // GPU side: pass in as constant +outAU2 numWorkGroupsAndMips, // GPU side: pass in as constant +inAU4 rectInfo, // left, top, width, height +ASU1 mips // optional: if -1, calculate based on rect width and height +){ + workGroupOffset[0] = rectInfo[0] / 64; // rectInfo[0] = left + workGroupOffset[1] = rectInfo[1] / 64; // rectInfo[1] = top + + AU1 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64; // rectInfo[0] = left, rectInfo[2] = width + AU1 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64; // rectInfo[1] = top, rectInfo[3] = height + + dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0]; + dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1]; + + numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]); + + if (mips >= 0) { + numWorkGroupsAndMips[1] = AU1(mips); + } else { // calculate based on rect width and height + AU1 resolution = AMaxU1(rectInfo[2], rectInfo[3]); + numWorkGroupsAndMips[1] = AU1((AMinF1(AFloorF1(ALog2F1(AF1(resolution))), AF1(12)))); + } +} + +A_STATIC void SpdSetup( + outAU2 dispatchThreadGroupCountXY, // CPU side: dispatch thread group count xy + outAU2 workGroupOffset, // GPU side: pass in as constant + outAU2 numWorkGroupsAndMips, // GPU side: pass in as constant + inAU4 rectInfo // left, top, width, height +) { + SpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1); +} +#endif // #ifdef 
A_CPU +//============================================================================================================================== +// NON-PACKED VERSION +//============================================================================================================================== +#ifdef A_GPU +#ifdef SPD_PACKED_ONLY + // Avoid compiler error + AF4 SpdLoadSourceImage(ASU2 p, AU1 slice){return AF4(0.0,0.0,0.0,0.0);} + AF4 SpdLoad(ASU2 p, AU1 slice){return AF4(0.0,0.0,0.0,0.0);} + void SpdStore(ASU2 p, AF4 value, AU1 mip, AU1 slice){} + AF4 SpdLoadIntermediate(AU1 x, AU1 y){return AF4(0.0,0.0,0.0,0.0);} + void SpdStoreIntermediate(AU1 x, AU1 y, AF4 value){} + AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3){return AF4(0.0,0.0,0.0,0.0);} +#endif // #ifdef SPD_PACKED_ONLY + +//_____________________________________________________________/\_______________________________________________________________ +#if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) +#extension GL_KHR_shader_subgroup_quad:require +#endif + +void SpdWorkgroupShuffleBarrier() { +#ifdef A_GLSL + barrier(); +#endif +#ifdef A_HLSL + GroupMemoryBarrierWithGroupSync(); +#endif +} + +// Only last active workgroup should proceed +bool SpdExitWorkgroup(AU1 numWorkGroups, AU1 localInvocationIndex, AU1 slice) +{ + // global atomic counter + if (localInvocationIndex == 0) + { + SpdIncreaseAtomicCounter(slice); + } + SpdWorkgroupShuffleBarrier(); + return (SpdGetAtomicCounter() != (numWorkGroups - 1)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// User defined: AF4 SpdReduce4(AF4 v0, AF4 v1, AF4 v2, AF4 v3); + +AF4 SpdReduceQuad(AF4 v) +{ + #if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + AF4 v0 = v; + AF4 v1 = subgroupQuadSwapHorizontal(v); + AF4 v2 = 
subgroupQuadSwapVertical(v); + AF4 v3 = subgroupQuadSwapDiagonal(v); + return SpdReduce4(v0, v1, v2, v3); + #elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + // requires SM6.0 + AU1 quad = WaveGetLaneIndex() & (~0x3); + AF4 v0 = v; + AF4 v1 = WaveReadLaneAt(v, quad | 1); + AF4 v2 = WaveReadLaneAt(v, quad | 2); + AF4 v3 = WaveReadLaneAt(v, quad | 3); + return SpdReduce4(v0, v1, v2, v3); + /* + // if SM6.0 is not available, you can use the AMD shader intrinsics + // the AMD shader intrinsics are available in AMD GPU Services (AGS) library: + // https://gpuopen.com/amd-gpu-services-ags-library/ + // works for DX11 + AF4 v0 = v; + AF4 v1; + v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + AF4 v2; + v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + AF4 v3; + v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + return SpdReduce4(v0, v1, v2, v3); + */ + #endif + return v; +} + +AF4 SpdReduceIntermediate(AU2 i0, AU2 i1, AU2 i2, AU2 i3) +{ + AF4 v0 = SpdLoadIntermediate(i0.x, i0.y); + AF4 v1 = SpdLoadIntermediate(i1.x, i1.y); + 
AF4 v2 = SpdLoadIntermediate(i2.x, i2.y); + AF4 v3 = SpdLoadIntermediate(i3.x, i3.y); + return SpdReduce4(v0, v1, v2, v3); +} + +AF4 SpdReduceLoad4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice) +{ + AF4 v0 = SpdLoad(ASU2(i0), slice); + AF4 v1 = SpdLoad(ASU2(i1), slice); + AF4 v2 = SpdLoad(ASU2(i2), slice); + AF4 v3 = SpdLoad(ASU2(i3), slice); + return SpdReduce4(v0, v1, v2, v3); +} + +AF4 SpdReduceLoad4(AU2 base, AU1 slice) +{ + return SpdReduceLoad4( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1)), + slice); +} + +AF4 SpdReduceLoadSourceImage4(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice) +{ + AF4 v0 = SpdLoadSourceImage(ASU2(i0), slice); + AF4 v1 = SpdLoadSourceImage(ASU2(i1), slice); + AF4 v2 = SpdLoadSourceImage(ASU2(i2), slice); + AF4 v3 = SpdLoadSourceImage(ASU2(i3), slice); + return SpdReduce4(v0, v1, v2, v3); +} + +AF4 SpdReduceLoadSourceImage(AU2 base, AU1 slice) +{ +#ifdef SPD_LINEAR_SAMPLER + return SpdLoadSourceImage(ASU2(base), slice); +#else + return SpdReduceLoadSourceImage4( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1)), + slice); +#endif +} + +void SpdDownsampleMips_0_1_Intrinsics(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ + AF4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[0], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[1], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[2], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + 
ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[3], 0, slice); + + if (mip <= 1) + return; + + v[0] = SpdReduceQuad(v[0]); + v[1] = SpdReduceQuad(v[1]); + v[2] = SpdReduceQuad(v[2]); + v[3] = SpdReduceQuad(v[3]); + + if ((localInvocationIndex % 4) == 0) + { + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x/2, y/2), v[0], 1, slice); + SpdStoreIntermediate( + x/2, y/2, v[0]); + + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x/2 + 8, y/2), v[1], 1, slice); + SpdStoreIntermediate( + x/2 + 8, y/2, v[1]); + + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x/2, y/2 + 8), v[2], 1, slice); + SpdStoreIntermediate( + x/2, y/2 + 8, v[2]); + + SpdStore(ASU2(workGroupID.xy * 16) + + ASU2(x/2 + 8, y/2 + 8), v[3], 1, slice); + SpdStoreIntermediate( + x/2 + 8, y/2 + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1_LDS(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ + AF4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[0], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[1], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[2], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImage(tex, slice); + SpdStore(pix, v[3], 0, slice); + + if (mip <= 1) + return; + + for (int i = 0; i < 4; i++) + { + SpdStoreIntermediate(x, y, v[i]); + SpdWorkgroupShuffleBarrier(); + if (localInvocationIndex < 64) + { + v[i] = SpdReduceIntermediate( + AU2(x * 2 + 0, y * 2 + 0), + AU2(x * 2 + 1, y * 2 + 0), + AU2(x * 
2 + 0, y * 2 + 1), + AU2(x * 2 + 1, y * 2 + 1) + ); + SpdStore(ASU2(workGroupID.xy * 16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice); + } + SpdWorkgroupShuffleBarrier(); + } + + if (localInvocationIndex < 64) + { + SpdStoreIntermediate(x + 0, y + 0, v[0]); + SpdStoreIntermediate(x + 8, y + 0, v[1]); + SpdStoreIntermediate(x + 0, y + 8, v[2]); + SpdStoreIntermediate(x + 8, y + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice); +#else + SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice); +#endif +} + + +void SpdDownsampleMip_2(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 64) + { + AF4 v = SpdReduceIntermediate( + AU2(x * 2 + 0, y * 2 + 0), + AU2(x * 2 + 1, y * 2 + 0), + AU2(x * 2 + 0, y * 2 + 1), + AU2(x * 2 + 1, y * 2 + 1) + ); + SpdStore(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip, slice); + // store to LDS, try to reduce bank conflicts + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // ... 
+ // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + SpdStoreIntermediate(x * 2 + y % 2, y * 2, v); + } +#else + AF4 v = SpdLoadIntermediate(x, y); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy * 8) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediate(x + (y/2) % 2, y, v); + } +#endif +} + +void SpdDownsampleMip_3(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 16) + { + // x 0 x 0 + // 0 0 0 0 + // 0 x 0 x + // 0 0 0 0 + AF4 v = SpdReduceIntermediate( + AU2(x * 4 + 0 + 0, y * 4 + 0), + AU2(x * 4 + 2 + 0, y * 4 + 0), + AU2(x * 4 + 0 + 1, y * 4 + 2), + AU2(x * 4 + 2 + 1, y * 4 + 2) + ); + SpdStore(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip, slice); + // store to LDS + // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 + // ... + // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 + // ... + // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x + // ... + SpdStoreIntermediate(x * 4 + y, y * 4, v); + } +#else + if (localInvocationIndex < 64) + { + AF4 v = SpdLoadIntermediate(x * 2 + y % 2,y * 2); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy * 4) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediate(x * 2 + y/2, y * 2, v); + } + } +#endif +} + +void SpdDownsampleMip_4(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 4) + { + // x 0 0 0 x 0 0 0 + // ... 
+ // 0 x 0 0 0 x 0 0 + AF4 v = SpdReduceIntermediate( + AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4), + AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4) + ); + SpdStore(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip, slice); + // store to LDS + // x x x x 0 ... + // 0 ... + SpdStoreIntermediate(x + y * 2, 0, v); + } +#else + if (localInvocationIndex < 16) + { + AF4 v = SpdLoadIntermediate(x * 4 + y,y * 4); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy * 2) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediate(x / 2 + y, 0, v); + } + } +#endif +} + +void SpdDownsampleMip_5(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 1) + { + // x x x x 0 ... + // 0 ... + AF4 v = SpdReduceIntermediate( + AU2(0, 0), + AU2(1, 0), + AU2(2, 0), + AU2(3, 0) + ); + SpdStore(ASU2(workGroupID.xy), v, mip, slice); + } +#else + if (localInvocationIndex < 4) + { + AF4 v = SpdLoadIntermediate(localInvocationIndex,0); + v = SpdReduceQuad(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStore(ASU2(workGroupID.xy), v, mip, slice); + } + } +#endif +} + +void SpdDownsampleMips_6_7(AU1 x, AU1 y, AU1 mips, AU1 slice) +{ + ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0); + ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0); + AF4 v0 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v0, 6, slice); + + tex = ASU2(x * 4 + 2, y * 4 + 0); + pix = ASU2(x * 2 + 1, y * 2 + 0); + AF4 v1 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v1, 6, slice); + + tex = ASU2(x * 4 + 0, y * 4 + 2); + pix = ASU2(x * 2 + 0, y * 2 + 1); + AF4 v2 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v2, 6, slice); + + tex = ASU2(x * 4 + 2, y * 4 + 2); + pix = ASU2(x * 2 + 1, y * 2 + 1); + AF4 v3 = SpdReduceLoad4(tex, slice); + SpdStore(pix, v3, 6, slice); + + if (mips <= 7) return; + // no barrier needed, 
working on values only from the same thread + + AF4 v = SpdReduce4(v0, v1, v2, v3); + SpdStore(ASU2(x, y), v, 7, slice); + SpdStoreIntermediate(x, y, v); +} + +void SpdDownsampleNextFour(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice) +{ + if (mips <= baseMip) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice); + + if (mips <= baseMip + 1) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice); + + if (mips <= baseMip + 2) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice); + + if (mips <= baseMip + 3) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice); +} + +void SpdDownsample( + AU2 workGroupID, + AU1 localInvocationIndex, + AU1 mips, + AU1 numWorkGroups, + AU1 slice +) { + AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64); + AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2); + AU1 y = sub_xy.y + 8 * ((localInvocationIndex >> 7)); + SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice); + + SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice); + + if (mips <= 6) return; + + if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice)) return; + + SpdResetAtomicCounter(slice); + + // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels. 
+ SpdDownsampleMips_6_7(x, y, mips, slice); + + SpdDownsampleNextFour(x, y, AU2(0,0), localInvocationIndex, 8, mips, slice); +} + +void SpdDownsample( + AU2 workGroupID, + AU1 localInvocationIndex, + AU1 mips, + AU1 numWorkGroups, + AU1 slice, + AU2 workGroupOffset +) { + SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +//============================================================================================================================== +// PACKED VERSION +//============================================================================================================================== + +#ifdef A_HALF + +#ifdef A_GLSL +#extension GL_EXT_shader_subgroup_extended_types_float16:require +#endif + +AH4 SpdReduceQuadH(AH4 v) +{ + #if defined(A_GLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + AH4 v0 = v; + AH4 v1 = subgroupQuadSwapHorizontal(v); + AH4 v2 = subgroupQuadSwapVertical(v); + AH4 v3 = subgroupQuadSwapDiagonal(v); + return SpdReduce4H(v0, v1, v2, v3); + #elif defined(A_HLSL) && !defined(SPD_NO_WAVE_OPERATIONS) + // requires SM6.0 + AU1 quad = WaveGetLaneIndex() & (~0x3); + AH4 v0 = v; + AH4 v1 = WaveReadLaneAt(v, quad | 1); + AH4 v2 = WaveReadLaneAt(v, quad | 2); + AH4 v3 = WaveReadLaneAt(v, quad | 3); + return SpdReduce4H(v0, v1, v2, v3); + /* + // if SM6.0 is not available, you can use the AMD shader intrinsics + // the AMD shader intrinsics are available in AMD GPU Services (AGS) library: + // https://gpuopen.com/amd-gpu-services-ags-library/ + // works for DX11 + AH4 v0 = v; + AH4 v1; + v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, 
AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1); + AH4 v2; + v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2); + AH4 v3; + v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4); + return SpdReduce4H(v0, v1, v2, v3); + */ + #endif + return AH4(0.0, 0.0, 0.0, 0.0); + +} + +AH4 SpdReduceIntermediateH(AU2 i0, AU2 i1, AU2 i2, AU2 i3) +{ + AH4 v0 = SpdLoadIntermediateH(i0.x, i0.y); + AH4 v1 = SpdLoadIntermediateH(i1.x, i1.y); + AH4 v2 = SpdLoadIntermediateH(i2.x, i2.y); + AH4 v3 = SpdLoadIntermediateH(i3.x, i3.y); + return SpdReduce4H(v0, v1, v2, v3); +} + +AH4 SpdReduceLoad4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice) +{ + AH4 v0 = SpdLoadH(ASU2(i0), slice); + AH4 v1 = SpdLoadH(ASU2(i1), slice); + AH4 v2 = SpdLoadH(ASU2(i2), slice); + AH4 v3 = SpdLoadH(ASU2(i3), slice); + return SpdReduce4H(v0, v1, v2, v3); +} + +AH4 SpdReduceLoad4H(AU2 base, AU1 slice) +{ + return SpdReduceLoad4H( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1)), + slice); +} + +AH4 SpdReduceLoadSourceImage4H(AU2 i0, AU2 i1, AU2 i2, AU2 i3, AU1 slice) +{ + AH4 v0 = SpdLoadSourceImageH(ASU2(i0), slice); + AH4 v1 = SpdLoadSourceImageH(ASU2(i1), slice); + AH4 v2 = 
SpdLoadSourceImageH(ASU2(i2), slice); + AH4 v3 = SpdLoadSourceImageH(ASU2(i3), slice); + return SpdReduce4H(v0, v1, v2, v3); +} + +AH4 SpdReduceLoadSourceImageH(AU2 base, AU1 slice) +{ +#ifdef SPD_LINEAR_SAMPLER + return SpdLoadSourceImageH(ASU2(base), slice); +#else + return SpdReduceLoadSourceImage4H( + AU2(base + AU2(0, 0)), + AU2(base + AU2(0, 1)), + AU2(base + AU2(1, 0)), + AU2(base + AU2(1, 1)), + slice); +#endif +} + +void SpdDownsampleMips_0_1_IntrinsicsH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice) +{ + AH4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[0], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[1], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[2], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[3], 0, slice); + + if (mips <= 1) + return; + + v[0] = SpdReduceQuadH(v[0]); + v[1] = SpdReduceQuadH(v[1]); + v[2] = SpdReduceQuadH(v[2]); + v[3] = SpdReduceQuadH(v[3]); + + if ((localInvocationIndex % 4) == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2, y/2), v[0], 1, slice); + SpdStoreIntermediateH(x/2, y/2, v[0]); + + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2 + 8, y/2), v[1], 1, slice); + SpdStoreIntermediateH(x/2 + 8, y/2, v[1]); + + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2, y/2 + 8), v[2], 1, slice); + SpdStoreIntermediateH(x/2, y/2 + 8, v[2]); + + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x/2 + 8, y/2 + 8), v[3], 1, 
slice); + SpdStoreIntermediateH(x/2 + 8, y/2 + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1_LDSH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice) +{ + AH4 v[4]; + + ASU2 tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2); + ASU2 pix = ASU2(workGroupID.xy * 32) + ASU2(x, y); + v[0] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[0], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y); + v[1] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[1], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x, y + 16); + v[2] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[2], 0, slice); + + tex = ASU2(workGroupID.xy * 64) + ASU2(x * 2 + 32, y * 2 + 32); + pix = ASU2(workGroupID.xy * 32) + ASU2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[3], 0, slice); + + if (mips <= 1) + return; + + for (int i = 0; i < 4; i++) + { + SpdStoreIntermediateH(x, y, v[i]); + SpdWorkgroupShuffleBarrier(); + if (localInvocationIndex < 64) + { + v[i] = SpdReduceIntermediateH( + AU2(x * 2 + 0, y * 2 + 0), + AU2(x * 2 + 1, y * 2 + 0), + AU2(x * 2 + 0, y * 2 + 1), + AU2(x * 2 + 1, y * 2 + 1) + ); + SpdStoreH(ASU2(workGroupID.xy * 16) + ASU2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice); + } + SpdWorkgroupShuffleBarrier(); + } + + if (localInvocationIndex < 64) + { + SpdStoreIntermediateH(x + 0, y + 0, v[0]); + SpdStoreIntermediateH(x + 8, y + 0, v[1]); + SpdStoreIntermediateH(x + 0, y + 8, v[2]); + SpdStoreIntermediateH(x + 8, y + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mips, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice); +#else + SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, 
slice); +#endif +} + + +void SpdDownsampleMip_2H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 64) + { + AH4 v = SpdReduceIntermediateH( + AU2(x * 2 + 0, y * 2 + 0), + AU2(x * 2 + 1, y * 2 + 0), + AU2(x * 2 + 0, y * 2 + 1), + AU2(x * 2 + 1, y * 2 + 1) + ); + SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x, y), v, mip, slice); + // store to LDS, try to reduce bank conflicts + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // ... + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v); + } +#else + AH4 v = SpdLoadIntermediateH(x, y); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 8) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediateH(x + (y/2) % 2, y, v); + } +#endif +} + +void SpdDownsampleMip_3H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 16) + { + // x 0 x 0 + // 0 0 0 0 + // 0 x 0 x + // 0 0 0 0 + AH4 v = SpdReduceIntermediateH( + AU2(x * 4 + 0 + 0, y * 4 + 0), + AU2(x * 4 + 2 + 0, y * 4 + 0), + AU2(x * 4 + 0 + 1, y * 4 + 2), + AU2(x * 4 + 2 + 1, y * 4 + 2) + ); + SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x, y), v, mip, slice); + // store to LDS + // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 + // ... + // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 + // ... + // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x + // ... 
+ SpdStoreIntermediateH(x * 4 + y, y * 4, v); + } +#else + if (localInvocationIndex < 64) + { + AH4 v = SpdLoadIntermediateH(x * 2 + y % 2,y * 2); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 4) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediateH(x * 2 + y/2, y * 2, v); + } + } +#endif +} + +void SpdDownsampleMip_4H(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 4) + { + // x 0 0 0 x 0 0 0 + // ... + // 0 x 0 0 0 x 0 0 + AH4 v = SpdReduceIntermediateH( + AU2(x * 8 + 0 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 4 + 0 + y * 2, y * 8 + 0), + AU2(x * 8 + 0 + 1 + y * 2, y * 8 + 4), + AU2(x * 8 + 4 + 1 + y * 2, y * 8 + 4) + ); + SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x, y), v, mip, slice); + // store to LDS + // x x x x 0 ... + // 0 ... + SpdStoreIntermediateH(x + y * 2, 0, v); + } +#else + if (localInvocationIndex < 16) + { + AH4 v = SpdLoadIntermediateH(x * 4 + y,y * 4); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy * 2) + ASU2(x/2, y/2), v, mip, slice); + SpdStoreIntermediateH(x / 2 + y, 0, v); + } + } +#endif +} + +void SpdDownsampleMip_5H(AU2 workGroupID, AU1 localInvocationIndex, AU1 mip, AU1 slice) +{ +#ifdef SPD_NO_WAVE_OPERATIONS + if (localInvocationIndex < 1) + { + // x x x x 0 ... + // 0 ... 
+ AH4 v = SpdReduceIntermediateH( + AU2(0, 0), + AU2(1, 0), + AU2(2, 0), + AU2(3, 0) + ); + SpdStoreH(ASU2(workGroupID.xy), v, mip, slice); + } +#else + if (localInvocationIndex < 4) + { + AH4 v = SpdLoadIntermediateH(localInvocationIndex,0); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(ASU2(workGroupID.xy), v, mip, slice); + } + } +#endif +} + +void SpdDownsampleMips_6_7H(AU1 x, AU1 y, AU1 mips, AU1 slice) +{ + ASU2 tex = ASU2(x * 4 + 0, y * 4 + 0); + ASU2 pix = ASU2(x * 2 + 0, y * 2 + 0); + AH4 v0 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v0, 6, slice); + + tex = ASU2(x * 4 + 2, y * 4 + 0); + pix = ASU2(x * 2 + 1, y * 2 + 0); + AH4 v1 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v1, 6, slice); + + tex = ASU2(x * 4 + 0, y * 4 + 2); + pix = ASU2(x * 2 + 0, y * 2 + 1); + AH4 v2 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v2, 6, slice); + + tex = ASU2(x * 4 + 2, y * 4 + 2); + pix = ASU2(x * 2 + 1, y * 2 + 1); + AH4 v3 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v3, 6, slice); + + if (mips < 8) return; + // no barrier needed, working on values only from the same thread + + AH4 v = SpdReduce4H(v0, v1, v2, v3); + SpdStoreH(ASU2(x, y), v, 7, slice); + SpdStoreIntermediateH(x, y, v); +} + +void SpdDownsampleNextFourH(AU1 x, AU1 y, AU2 workGroupID, AU1 localInvocationIndex, AU1 baseMip, AU1 mips, AU1 slice) +{ + if (mips <= baseMip) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice); + + if (mips <= baseMip + 1) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice); + + if (mips <= baseMip + 2) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice); + + if (mips <= baseMip + 3) return; + SpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, 
slice); +} + +void SpdDownsampleH( + AU2 workGroupID, + AU1 localInvocationIndex, + AU1 mips, + AU1 numWorkGroups, + AU1 slice +) { + AU2 sub_xy = ARmpRed8x8(localInvocationIndex % 64); + AU1 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2); + AU1 y = sub_xy.y + 8 * ((localInvocationIndex >> 7)); + + SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice); + + SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice); + + if (mips < 7) return; + + if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice)) return; + + SpdResetAtomicCounter(slice); + + // After mip 6 there is only a single workgroup left that downsamples the remaining up to 64x64 texels. + SpdDownsampleMips_6_7H(x, y, mips, slice); + + SpdDownsampleNextFourH(x, y, AU2(0,0), localInvocationIndex, 8, mips, slice); +} + +void SpdDownsampleH( + AU2 workGroupID, + AU1 localInvocationIndex, + AU1 mips, + AU1 numWorkGroups, + AU1 slice, + AU2 workGroupOffset +) { + SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice); +} + +#endif // #ifdef A_HALF +#endif // #ifdef A_GPU \ No newline at end of file diff --git a/bin/data/shaders/graph/cull/comp.glsl b/bin/data/shaders/graph/cull/comp.glsl index e0eb3301..bc4041b9 100644 --- a/bin/data/shaders/graph/cull/comp.glsl +++ b/bin/data/shaders/graph/cull/comp.glsl @@ -5,7 +5,7 @@ #extension GL_EXT_samplerless_texture_functions : enable layout (constant_id = 0) const uint PASSES = 6; -layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; #define COMPUTE 1 #define QUERY_MIPMAPS 1 @@ -73,106 +73,100 @@ layout (std140, binding = 3) buffer Objects { layout (binding = 4) uniform sampler2D samplerDepth; +shared vec4 sharedPlanes[PASSES][6]; + vec4 normalizePlane( vec4 p ) { - return p / length(p.xyz); -} - -bool frustumCull( uint id ) { - if ( PushConstant.passes == 0 ) return true; - - const 
DrawCommand drawCommand = drawCommands[id]; - const Instance instance = instances[drawCommand.instanceID]; - const Object object = objects[instance.objectID]; - - if ( drawCommand.indices == 0 || drawCommand.vertices == 0 ) return false; - - bool visible = false; - for ( uint pass = 0; pass < PushConstant.passes; ++pass ) { - mat4 mat = camera.viewport[pass].projection * camera.viewport[pass].view * object.model; - vec4 planes[6]; { - for (int i = 0; i < 3; ++i) - for (int j = 0; j < 2; ++j) { - planes[i*2+j].x = mat[0][3] + (j == 0 ? mat[0][i] : -mat[0][i]); - planes[i*2+j].y = mat[1][3] + (j == 0 ? mat[1][i] : -mat[1][i]); - planes[i*2+j].z = mat[2][3] + (j == 0 ? mat[2][i] : -mat[2][i]); - planes[i*2+j].w = mat[3][3] + (j == 0 ? mat[3][i] : -mat[3][i]); - planes[i*2+j] = normalizePlane( planes[i*2+j] ); - } - } - bool insideFrustum = true; - for ( uint p = 0; p < 6; ++p ) { - float d = max(instance.bounds.min.x * planes[p].x, instance.bounds.max.x * planes[p].x) - + max(instance.bounds.min.y * planes[p].y, instance.bounds.max.y * planes[p].y) - + max(instance.bounds.min.z * planes[p].z, instance.bounds.max.z * planes[p].z); - - if (d < -planes[p].w) { - insideFrustum = false; - break; - } - } - - if ( insideFrustum ) { - visible = true; - break; - } - } - return visible; -} - -bool occlusionCull( uint id ) { - if ( PushConstant.passes == 0 ) return true; - - const DrawCommand drawCommand = drawCommands[id]; - const Instance instance = instances[drawCommand.instanceID]; - const Object object = objects[instance.objectID]; - - bool visible = false; - for ( uint pass = 0; pass < PushConstant.passes; ++pass ) { - vec4 aabb; - vec4 sphere = aabbToSphere( instance.bounds ); - - float scale = length(object.model[0].xyz); - vec3 center = (camera.viewport[pass].view * object.model * vec4(sphere.xyz, 1)).xyz; - float radius = scale * sphere.w; - - mat4 proj = camera.viewport[pass].projection; - float znear = proj[3][2]; - float P00 = proj[0][0]; - float P11 = proj[1][1]; - 
- if ( projectSphere( center, radius, znear, P00, P11, aabb ) ) { - vec2 pyramidSize = vec2(textureSize( samplerDepth, 0 )); - - float width = (aabb.z - aabb.x) * pyramidSize.x; - float height = (aabb.w - aabb.y) * pyramidSize.y; - - float level = max(0.0, floor(log2(max(width, height)))); - - float d1 = textureLod(samplerDepth, vec2(aabb.x, aabb.y), level).x; - float d2 = textureLod(samplerDepth, vec2(aabb.z, aabb.y), level).x; - float d3 = textureLod(samplerDepth, vec2(aabb.x, aabb.w), level).x; - float d4 = textureLod(samplerDepth, vec2(aabb.z, aabb.w), level).x; - - float depth = min(min(d1, d2), min(d3, d4)); // min for reverse-z projection, max for standard - float depthSphere = znear / (center.z - radius); - - if ( depthSphere >= depth - DEPTH_BIAS ) { - visible = true; - break; - } - } else { - visible = true; - break; - } - } - return visible; + return p / length(p.xyz); } void main() { - const uint gID = gl_GlobalInvocationID.x; - if ( !(0 <= gID && gID < drawCommands.length()) ) return; + const uint gID = gl_GlobalInvocationID.x; + const uint lID = gl_LocalInvocationIndex; - bool visible = frustumCull( gID ); - if ( visible ) visible = occlusionCull( gID ); - drawCommands[gID].instances = visible ? 1 : 0; + if ( lID == 0 ) { + for (uint pass = 0; pass < PushConstant.passes; ++pass) { + mat4 mat = camera.viewport[pass].projection * camera.viewport[pass].view; + for (int i = 0; i < 3; ++i) + for (int j = 0; j < 2; ++j) { + sharedPlanes[pass][i*2+j].x = mat[0][3] + (j == 0 ? mat[0][i] : -mat[0][i]); + sharedPlanes[pass][i*2+j].y = mat[1][3] + (j == 0 ? mat[1][i] : -mat[1][i]); + sharedPlanes[pass][i*2+j].z = mat[2][3] + (j == 0 ? mat[2][i] : -mat[2][i]); + sharedPlanes[pass][i*2+j].w = mat[3][3] + (j == 0 ? 
mat[3][i] : -mat[3][i]); + sharedPlanes[pass][i*2+j] = normalizePlane( sharedPlanes[pass][i*2+j] ); + } + } + } + barrier(); + + if ( gID >= drawCommands.length() ) return; + + const DrawCommand drawCommand = drawCommands[gID]; + if ( drawCommand.indices == 0 || drawCommand.vertices == 0 ) return; + + const Instance instance = instances[drawCommand.instanceID]; + const Object object = objects[instance.objectID]; + + vec4 sphere = aabbToSphere( instance.bounds ); + vec3 worldCenter = (object.model * vec4(sphere.xyz, 1.0)).xyz; + + float scaleX = length(object.model[0].xyz); + float scaleY = length(object.model[1].xyz); + float scaleZ = length(object.model[2].xyz); + float maxScale = max(max(scaleX, scaleY), scaleZ); + float worldRadius = sphere.w * maxScale; + + bool isVisible = false; + for ( uint pass = 0; pass < PushConstant.passes; ++pass ) { + bool insideFrustum = true; + for ( int p = 0; p < 6; ++p ) { + if ( dot(sharedPlanes[pass][p].xyz, worldCenter) + sharedPlanes[pass][p].w < -worldRadius ) { + insideFrustum = false; + break; + } + } + if ( insideFrustum ) { + isVisible = true; + break; + } + } + + if ( isVisible ) { + isVisible = false; + for ( uint pass = 0; pass < PushConstant.passes; ++pass ) { + vec4 aabb; + vec3 viewCenter = ( camera.viewport[pass].view * vec4(worldCenter, 1.0) ).xyz; + + mat4 proj = camera.viewport[pass].projection; + float znear = proj[3][2]; + float P00 = proj[0][0]; + float P11 = proj[1][1]; + + if ( projectSphere(viewCenter, worldRadius, znear, P00, P11, aabb) ) { + vec2 pyramidSize = vec2(textureSize( samplerDepth, 0 )); + float width = (aabb.z - aabb.x) * pyramidSize.x; + float height = (aabb.w - aabb.y) * pyramidSize.y; + + float level = floor(log2(max(width, height))); + level = max(0.0, level); + + float d1 = textureLod(samplerDepth, vec2(aabb.x, aabb.y), level).x; + float d2 = textureLod(samplerDepth, vec2(aabb.z, aabb.y), level).x; + float d3 = textureLod(samplerDepth, vec2(aabb.x, aabb.w), level).x; + float d4 = 
textureLod(samplerDepth, vec2(aabb.z, aabb.w), level).x; + + float depth = min(min(d1, d2), min(d3, d4)); + float depthSphere = znear / (viewCenter.z - worldRadius); + + if ( depthSphere >= depth - DEPTH_BIAS ) { + isVisible = true; + break; + } + } else { + isVisible = true; + break; + } + } + } + + drawCommands[gID].instances = isVisible ? 1 : 0; } \ No newline at end of file diff --git a/bin/data/shaders/raytrace/shader.ray-gen.glsl b/bin/data/shaders/raytrace/shader.ray-gen.glsl index b9b9285b..a2608cd7 100644 --- a/bin/data/shaders/raytrace/shader.ray-gen.glsl +++ b/bin/data/shaders/raytrace/shader.ray-gen.glsl @@ -390,7 +390,7 @@ void main() { #endif { #if BLOOM - float brightness = dot(surface.fragment.rgb, vec3(0.2126, 0.7152, 0.0722)); + float brightness = luma(surface.fragment.rgb); vec4 outFragBright = brightness > ubo.threshold ? vec4(surface.fragment.rgb, 1.0) : vec4(0, 0, 0, 1); // imageStore(outImage, ivec2(gl_LaunchIDEXT.xy), outFragBright); #endif diff --git a/engine/inc/uf/engine/graph/graph.h b/engine/inc/uf/engine/graph/graph.h index 8a6253ba..48e14849 100644 --- a/engine/inc/uf/engine/graph/graph.h +++ b/engine/inc/uf/engine/graph/graph.h @@ -111,6 +111,8 @@ namespace pod { uf::renderer::Buffer material; uf::renderer::Buffer texture; uf::renderer::Buffer light; + + uf::renderer::Texture2D depthPyramid; } buffers; }/* storage*/; }; diff --git a/engine/inc/uf/ext/vulkan/device.h b/engine/inc/uf/ext/vulkan/device.h index 0c3472a9..39663151 100644 --- a/engine/inc/uf/ext/vulkan/device.h +++ b/engine/inc/uf/ext/vulkan/device.h @@ -28,6 +28,8 @@ namespace ext { operator VkCommandBuffer() { return handle; } }; + struct Texture; + struct UF_API Device { VkInstance instance; VkDebugUtilsMessengerEXT debugMessenger; @@ -86,6 +88,7 @@ namespace ext { uf::stl::vector buffers; uf::stl::vector ass; + uf::stl::vector textures; } transient; struct { diff --git a/engine/inc/uf/ext/vulkan/graphic.h b/engine/inc/uf/ext/vulkan/graphic.h index 01f5ac15..1a7e438b 
100644 --- a/engine/inc/uf/ext/vulkan/graphic.h +++ b/engine/inc/uf/ext/vulkan/graphic.h @@ -42,8 +42,8 @@ namespace ext { void record( const Graphic& graphic, const GraphicDescriptor& descriptor, VkCommandBuffer, size_t = 0, size_t = 0, size_t = 0 ) const; void destroy(); - uf::stl::vector getShaders( uf::stl::vector& ); - uf::stl::vector getShaders( const uf::stl::vector& ) const; + uf::stl::vector getShaders( uf::stl::vector&, const uf::stl::string& = "" ); + uf::stl::vector getShaders( const uf::stl::vector&, const uf::stl::string& = "" ) const; void collectBuffers( const Shader& shader, const RenderMode& renderMode, const Graphic& graphic, const std::function& lambda ) const; }; diff --git a/engine/inc/uf/ext/vulkan/texture.h b/engine/inc/uf/ext/vulkan/texture.h index 7abc5447..76dd7a9c 100644 --- a/engine/inc/uf/ext/vulkan/texture.h +++ b/engine/inc/uf/ext/vulkan/texture.h @@ -138,7 +138,7 @@ namespace ext { inline void update( uf::Image& image, uint32_t layer = 1 ) { return this->update(image, this->imageLayout, layer); } inline void update( void* data, VkDeviceSize size, uint32_t layer = 1 ) { return this->update(data, size, this->imageLayout, layer); } - void generateMipmaps(VkCommandBuffer commandBuffer, uint32_t layer = 1); + void generateMipmaps(VkCommandBuffer commandBuffer, uint32_t layer = 0); void fromBuffers( void* buffer, VkDeviceSize bufferSize, VkFormat format, uint32_t texWidth, uint32_t texHeight, uint32_t texDepth, uint32_t layers, VkImageUsageFlags imageUsageFlags = VK_IMAGE_USAGE_SAMPLED_BIT, VkImageLayout imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL ); inline void fromBuffers( void* buffer, VkDeviceSize bufferSize, VkFormat format, uint32_t texWidth, uint32_t texHeight, VkImageUsageFlags imageUsageFlags = VK_IMAGE_USAGE_SAMPLED_BIT, VkImageLayout imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL ) { return this->fromBuffers(buffer, bufferSize, format, texWidth, texHeight, 1, 1, imageUsageFlags, imageLayout); } diff --git 
a/engine/inc/uf/ext/vulkan/vk.h b/engine/inc/uf/ext/vulkan/vk.h index 0daea033..6191b2d2 100644 --- a/engine/inc/uf/ext/vulkan/vk.h +++ b/engine/inc/uf/ext/vulkan/vk.h @@ -22,7 +22,7 @@ #define VK_DEFAULT_STAGE_BUFFERS ext::vulkan::settings::defaultStageBuffers #define VK_DEFAULT_DEFER_BUFFER_DESTROY ext::vulkan::settings::defaultDeferBufferDestroy #define VK_DEFAULT_COMMAND_BUFFER_IMMEDIATE ext::vulkan::settings::defaultCommandBufferImmediate -#define VK_UBO_USE_N_BUFFERS 1 +#define VK_UBO_USE_N_BUFFERS 0 namespace ext { namespace vulkan { diff --git a/engine/inc/uf/ext/vulkan/vulkan.h b/engine/inc/uf/ext/vulkan/vulkan.h index 1cb71fe4..7b28d4d8 100644 --- a/engine/inc/uf/ext/vulkan/vulkan.h +++ b/engine/inc/uf/ext/vulkan/vulkan.h @@ -182,10 +182,6 @@ namespace ext { extern UF_API uint32_t frameSkip; } - namespace gc { - extern UF_API uf::stl::vector textures; - } - extern UF_API Device device; extern UF_API Allocator allocator; diff --git a/engine/src/engine/ext/scene/behavior.cpp b/engine/src/engine/ext/scene/behavior.cpp index ca9ebffd..73162e20 100644 --- a/engine/src/engine/ext/scene/behavior.cpp +++ b/engine/src/engine/ext/scene/behavior.cpp @@ -938,8 +938,9 @@ void ext::ExtSceneBehavior::bindBuffers( uf::Object& self, uf::renderer::Graphic #if UF_USE_VULKAN // only update this when requested // done outside of deserialize because the rendermode might not be initialized in time - if ( uf::renderer::settings::pipelines::bloom && metadata.bloom.outOfDate && graphic.material.hasShader("compute", "bloom") ) { - auto& shader = graphic.material.getShader("compute", "bloom"); + if ( uf::renderer::settings::pipelines::bloom && metadata.bloom.outOfDate && graphic.material.hasShader("compute", "bloom-down") ) { + auto& shaderDown = graphic.material.getShader("compute", "bloom-down"); + auto& shaderUp = graphic.material.getShader("compute", "bloom-up"); struct UniformDescriptor { float threshold; @@ -974,7 +975,12 @@ void ext::ExtSceneBehavior::bindBuffers( uf::Object& 
self, uf::renderer::Graphic for ( auto i = 0; i < uniforms.size; ++i ) uniforms.weights[i] = tempWeights[i] / sum; metadata.bloom.outOfDate = false; - if ( shader.hasUniform("UBO") ) shader.updateBuffer( (const void*) &uniforms, sizeof(uniforms), shader.getUniformBuffer("UBO") ); + if ( shaderDown.hasUniform("UBO") ) { + shaderDown.updateBuffer( (const void*) &uniforms, sizeof(uniforms), shaderDown.getUniformBuffer("UBO") ); + } + if ( shaderUp.hasUniform("UBO") ) { + shaderUp.updateBuffer( (const void*) &uniforms, sizeof(uniforms), shaderUp.getUniformBuffer("UBO") ); + } } struct UniformDescriptor { diff --git a/engine/src/engine/graph/graph.cpp b/engine/src/engine/graph/graph.cpp index 8f5c8c67..6771d01f 100644 --- a/engine/src/engine/graph/graph.cpp +++ b/engine/src/engine/graph/graph.cpp @@ -196,7 +196,6 @@ namespace { // compute shader auto& shader = graphic.material.getShader("compute", uf::renderer::settings::pipelines::names::culling); - shader.aliasAttachment("depthPyramid"); } // vxgi pipeline if ( uf::renderer::settings::pipelines::vxgi ) { @@ -479,6 +478,9 @@ namespace { shader.aliasBuffer( "indirect", *indirect ); shader.aliasBuffer( "instance", storage.buffers.instance ); shader.aliasBuffer( "object", storage.buffers.object ); + + shader.textures.clear(); + shader.textures.emplace_back().aliasTexture( storage.buffers.depthPyramid ); } // vxgi pipeline @@ -1532,10 +1534,12 @@ void uf::graph::destroy( uf::Object& object, bool soft ) { void uf::graph::destroy( pod::Graph::Storage& storage, bool soft ) { soft = false; #if UF_USE_VULKAN +/* for ( auto& texture : uf::renderer::gc::textures ) { texture.destroy( false ); } uf::renderer::gc::textures.clear(); +*/ #endif // cleanup graphic handles diff --git a/engine/src/ext/vulkan/graphic.cpp b/engine/src/ext/vulkan/graphic.cpp index 8388fffb..e91821f4 100644 --- a/engine/src/ext/vulkan/graphic.cpp +++ b/engine/src/ext/vulkan/graphic.cpp @@ -41,7 +41,7 @@ void ext::vulkan::Pipeline::initialize( const Graphic& 
graphic, const GraphicDes this->metadata.type = descriptor.pipeline; Device& device = *graphic.device; - auto shaders = getShaders( graphic.material.shaders ); + auto shaders = getShaders( graphic.material.shaders, descriptor.pipeline ); assert( shaders.size() > 0 ); uint32_t subpass = descriptor.subpass; @@ -397,7 +397,10 @@ void ext::vulkan::Pipeline::record( const Graphic& graphic, VkCommandBuffer comm return record( graphic, descriptor, commandBuffer, pass, draw, offset ); } void ext::vulkan::Pipeline::record( const Graphic& graphic, const GraphicDescriptor& descriptor, VkCommandBuffer commandBuffer, size_t pass, size_t draw, size_t offset ) const { - auto shaders = getShaders( graphic.material.shaders ); + auto shaders = getShaders( graphic.material.shaders, descriptor.pipeline ); + for ( auto i = 0; i < shaders.size(); ++i ) { + // UF_MSG_DEBUG("{} | {}: {}", descriptor.pipeline, i, shaders[i]->filename); + } // create dynamic offset ranges static thread_local uf::stl::vector dynamicOffsets; @@ -427,6 +430,7 @@ void ext::vulkan::Pipeline::record( const Graphic& graphic, const GraphicDescrip else continue; } + // automatically bind to our default push constants if ( shader->metadata.definitions.pushConstants.count("PushConstant") > 0 ) { struct PushConstant { uint32_t pass; @@ -450,7 +454,10 @@ void ext::vulkan::Pipeline::record( const Graphic& graphic, const GraphicDescrip } // no matching bind point for shaders, skip - if ( !bound ) return; + if ( !bound ) { + UF_MSG_DEBUG("No shaders found to bind..."); + return; + } // Bind descriptor sets describing shader binding points #if VK_UBO_USE_N_BUFFERS @@ -506,7 +513,7 @@ void ext::vulkan::Pipeline::update( const Graphic& graphic, const GraphicDescrip RenderMode& renderMode = ext::vulkan::getRenderMode(descriptor.renderMode, true); auto& renderTarget = renderMode.getRenderTarget(/*descriptor.renderTarget*/); - auto shaders = getShaders( graphic.material.shaders ); + auto shaders = getShaders( 
graphic.material.shaders, descriptor.pipeline ); uf::stl::vector writeDescriptorSets; uf::stl::vector tlases; @@ -947,32 +954,32 @@ void ext::vulkan::Pipeline::destroy() { } */ } -uf::stl::vector ext::vulkan::Pipeline::getShaders( uf::stl::vector& shaders ) { +uf::stl::vector ext::vulkan::Pipeline::getShaders( uf::stl::vector& shaders, const uf::stl::string& type ) { uf::stl::unordered_map map; uf::stl::vector res; bool isCompute = false; for ( auto& shader : shaders ) { - if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != metadata.type ) continue; + if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != (type == "" ? metadata.type : type) ) continue; if ( shader.descriptor.stage == VK_SHADER_STAGE_COMPUTE_BIT ) isCompute = true; } for ( auto& shader : shaders ) { - if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != metadata.type ) continue; + if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != (type == "" ? metadata.type : type) ) continue; if ( isCompute && shader.descriptor.stage != VK_SHADER_STAGE_COMPUTE_BIT ) continue; map[shader.metadata.type] = &shader; } for ( auto pair : map ) res.insert( res.begin(), pair.second); return res; } -uf::stl::vector ext::vulkan::Pipeline::getShaders( const uf::stl::vector& shaders ) const { +uf::stl::vector ext::vulkan::Pipeline::getShaders( const uf::stl::vector& shaders, const uf::stl::string& type ) const { uf::stl::unordered_map map; uf::stl::vector res; bool isCompute = false; for ( auto& shader : shaders ) { - if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != metadata.type ) continue; + if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != (type == "" ? 
metadata.type : type) ) continue; if ( shader.descriptor.stage == VK_SHADER_STAGE_COMPUTE_BIT ) isCompute = true; } for ( auto& shader : shaders ) { - if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != metadata.type ) continue; + if ( shader.metadata.pipeline != "" && shader.metadata.pipeline != (type == "" ? metadata.type : type) ) continue; if ( isCompute && shader.descriptor.stage != VK_SHADER_STAGE_COMPUTE_BIT ) continue; map[shader.metadata.type] = &shader; } @@ -1839,19 +1846,19 @@ void ext::vulkan::Graphic::record( VkCommandBuffer commandBuffer, size_t pass, s void ext::vulkan::Graphic::record( VkCommandBuffer commandBuffer, const GraphicDescriptor& descriptor, size_t pass, size_t draw, size_t offset ) const { if ( !process ) return; if ( !this->hasPipeline( descriptor ) ) { - VK_DEBUG_VALIDATION_MESSAGE(this << ": has no valid pipeline ({} {})", descriptor.renderMode, descriptor.renderTarget); + //UF_MSG_DEBUG("{} has no valid pipeline ({}:{}:{})", (void*) this, descriptor.renderMode, descriptor.renderTarget, descriptor.pipeline); return; } auto& pipeline = this->getPipeline( descriptor ); if ( pipeline.descriptorSet == VK_NULL_HANDLE ) { - VK_DEBUG_VALIDATION_MESSAGE(this << ": has no valid pipeline descriptor set ({} {})", descriptor.renderMode, descriptor.renderTarget); + //UF_MSG_DEBUG("{} has no valid pipeline descriptor set ({}:{}:{})", (void*) this, descriptor.renderMode, descriptor.renderTarget, descriptor.pipeline); return; } if ( !pipeline.metadata.process ) return; pipeline.record(*this, descriptor, commandBuffer, pass, draw, offset); - auto shaders = pipeline.getShaders( material.shaders ); + auto shaders = pipeline.getShaders( material.shaders, descriptor.pipeline ); for ( auto* shader : shaders ) { if ( shader->descriptor.stage == VK_SHADER_STAGE_COMPUTE_BIT ) return; if ( diff --git a/engine/src/ext/vulkan/rendermodes/base.cpp b/engine/src/ext/vulkan/rendermodes/base.cpp index 6b992d49..7c3a6726 100644 --- 
a/engine/src/ext/vulkan/rendermodes/base.cpp +++ b/engine/src/ext/vulkan/rendermodes/base.cpp @@ -273,8 +273,8 @@ void ext::vulkan::BaseRenderMode::initialize( Device& device ) { // swapchain.destroy(); swapchain.initialize( device ); // bind swapchain images - images.resize( ext::vulkan::swapchain.buffers ); - VK_CHECK_RESULT(vkGetSwapchainImagesKHR( device, swapchain.swapChain, &swapchain.buffers, images.data())); + ::images.resize( ext::vulkan::swapchain.buffers ); + VK_CHECK_RESULT(vkGetSwapchainImagesKHR( device, swapchain.swapChain, &swapchain.buffers, ::images.data())); // create image views for swapchain images renderTarget.attachments.clear(); @@ -302,7 +302,7 @@ void ext::vulkan::BaseRenderMode::initialize( Device& device ) { colorAttachmentView.subresourceRange.layerCount = 1; colorAttachmentView.viewType = VK_IMAGE_VIEW_TYPE_2D; colorAttachmentView.flags = 0; - colorAttachmentView.image = images[frame]; + colorAttachmentView.image = ::images[frame]; VK_CHECK_RESULT(vkCreateImageView( device, &colorAttachmentView, nullptr, &renderTarget.attachments[frame].view)); VK_REGISTER_HANDLE( renderTarget.attachments[frame].view ); @@ -312,7 +312,7 @@ void ext::vulkan::BaseRenderMode::initialize( Device& device ) { renderTarget.attachments[frame].descriptor.layout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; renderTarget.attachments[frame].descriptor.usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; renderTarget.attachments[frame].descriptor.aliased = true; - renderTarget.attachments[frame].image = images[frame]; + renderTarget.attachments[frame].image = ::images[frame]; renderTarget.attachments[frame].mem = VK_NULL_HANDLE; metadata.attachments["color["+std::to_string((int) frame)+"]"] = attachmentIndex++; @@ -530,7 +530,7 @@ void ext::vulkan::BaseRenderMode::initialize( Device& device ) { // Create framebuffer { // Create a frame buffer for every image in the swapchain - renderTarget.framebuffers.resize(images.size()); + renderTarget.framebuffers.resize(::images.size()); for 
(size_t frame = 0; frame < renderTarget.framebuffers.size(); frame++) { std::array attachments; @@ -555,7 +555,7 @@ void ext::vulkan::BaseRenderMode::initialize( Device& device ) { #if 0 if ( true ) { auto commandBuffer = device.fetchCommandBuffer(uf::renderer::QueueEnum::TRANSFER); - for ( size_t frame = 0; frame < images.size(); ++frame ) { + for ( size_t frame = 0; frame < ::images.size(); ++frame ) { VkImageMemoryBarrier imageMemoryBarrier = {}; imageMemoryBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; imageMemoryBarrier.srcAccessMask = 0; @@ -652,10 +652,11 @@ void ext::vulkan::BaseRenderMode::destroy() { } for ( auto& image : ::images ) { - // vkDestroyImage( *device, image, nullptr ); + // vkDestroyImage( *device, image, nullptr ); // destroyed via vkDestroySwapchainKHR VK_UNREGISTER_HANDLE( image ); image = VK_NULL_HANDLE; } + ::images.clear(); ext::vulkan::RenderMode::destroy(); diff --git a/engine/src/ext/vulkan/rendermodes/deferred.cpp b/engine/src/ext/vulkan/rendermodes/deferred.cpp index d77a78d0..d4a6528f 100644 --- a/engine/src/ext/vulkan/rendermodes/deferred.cpp +++ b/engine/src/ext/vulkan/rendermodes/deferred.cpp @@ -26,22 +26,34 @@ namespace { const uf::stl::string DEFERRED_MODE = "compute"; - ext::vulkan::Texture depthPyramid; + uf::stl::vector depthPyramidViews; - - void cmdImageBarrier(VkCommandBuffer commandBuffer, VkImage image, VkAccessFlags srcAccess, VkAccessFlags dstAccess, VkImageLayout oldLayout, VkImageLayout newLayout) { - VkImageMemoryBarrier barrier{VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER}; - barrier.srcAccessMask = srcAccess; - barrier.dstAccessMask = dstAccess; - barrier.oldLayout = oldLayout; - barrier.newLayout = newLayout; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = image; - barrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }; - - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, NULL, 0, NULL, 1, &barrier); + uf::stl::vector bloomViews; + + ext::vulkan::Buffer atomicCounterBloom; + ext::vulkan::Buffer atomicCounterDepth; + + struct AtomicCounter { + uint32_t counter; }; + struct PushConstants { + uint32_t mips; + uint32_t numWorkGroups; + uint32_t workGroupOffset; + }; + + void destroyImageView( ext::vulkan::RenderMode* self, VkImageView view ) { + ext::vulkan::mutex.lock(); + auto& texture = self->device->transient.textures.emplace_back(); + ext::vulkan::mutex.unlock(); + + texture.device = self->device; + texture.view = view; + /* + vkDestroyImageView(self->device.logicalDevice, view, nullptr); + VK_UNREGISTER_HANDLE(view); + */ + } } #include "./transition.inl" @@ -64,8 +76,7 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { struct { size_t id, bary, depth, uv, normal; - size_t color, bright, motion, scratch, output; - size_t depthPyramid; + size_t color, bright, motion, output; } attachments = {}; bool blend = true; // !ext::vulkan::settings::invariant::deferredSampling; @@ -108,7 +119,7 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { attachments.depth = renderTarget.attach(RenderTarget::Attachment::Descriptor{ /*.format = */ext::vulkan::settings::formats::depth, /*.layout = */VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, - /*.usage = */VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT, + /*.usage = */VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT, /*.blend = */false, /*.samples = */msaa, //*.mips = */1, @@ -127,13 +138,7 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { /*.usage =*/ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | 
VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, /*.blend =*/ blend, /*.samples =*/ 1, - }); - attachments.scratch = renderTarget.attach(RenderTarget::Attachment::Descriptor{ - /*.format =*/ ext::vulkan::settings::pipelines::hdr ? enums::Format::HDR : enums::Format::SDR, - /*.layout = */ VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, - /*.usage =*/ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, - /*.blend =*/ blend, - /*.samples =*/ 1, + /*.mips =*/ mips, }); attachments.motion = renderTarget.attach(RenderTarget::Attachment::Descriptor{ // /*.format = */VK_FORMAT_R32G32B32A32_SFLOAT, @@ -143,14 +148,6 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { /*.blend = */false, /*.samples = */1, }); - attachments.depthPyramid = renderTarget.attach(RenderTarget::Attachment::Descriptor{ - /*.format = */VK_FORMAT_R32_SFLOAT, - /*.layout = */ VK_IMAGE_LAYOUT_GENERAL, - /*.usage = */ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, - /*.blend = */false, - /*.samples = */1, - /*.mips = */mips, - }); metadata.attachments["id"] = attachments.id; @@ -164,10 +161,8 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { #endif metadata.attachments["depth"] = attachments.depth; - metadata.attachments["depthPyramid"] = attachments.depthPyramid; metadata.attachments["color"] = attachments.color; metadata.attachments["bright"] = attachments.bright; - metadata.attachments["scratch"] = attachments.scratch; metadata.attachments["motion"] = attachments.motion; metadata.attachments["output"] = attachments.color; @@ -339,13 +334,67 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { } if ( settings::pipelines::bloom ) { - uf::stl::string computeShaderFilename = uf::io::resolveURI(uf::io::root+"/shaders/display/bloom/comp.spv"); - blitter.material.attachShader(computeShaderFilename, uf::renderer::enums::Shader::COMPUTE, "bloom"); + 
uf::stl::string computeShaderFilename = uf::io::resolveURI(uf::io::root+"/shaders/display/bloom/up.comp.spv"); + blitter.material.attachShader(computeShaderFilename, uf::renderer::enums::Shader::COMPUTE, "bloom-up"); + + auto& shader = blitter.material.getShader("compute", "bloom-up"); - auto& shader = blitter.material.getShader("compute", "bloom"); shader.aliasAttachment("color", this, VK_IMAGE_LAYOUT_GENERAL); shader.aliasAttachment("bright", this, VK_IMAGE_LAYOUT_GENERAL); - shader.aliasAttachment("scratch", this, VK_IMAGE_LAYOUT_GENERAL); + } + + if ( settings::pipelines::bloom ) { + uf::stl::string computeShaderFilename = uf::io::resolveURI(uf::io::root+"/shaders/display/bloom/down.comp.spv"); + blitter.material.attachShader(computeShaderFilename, uf::renderer::enums::Shader::COMPUTE, "bloom-down"); + + auto& shader = blitter.material.getShader("compute", "bloom-down"); + auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); + + shader.aliasAttachment("color", this, VK_IMAGE_LAYOUT_GENERAL); + shader.aliasAttachment("bright", this, VK_IMAGE_LAYOUT_GENERAL); + + shader.setSpecializationConstants({ + { "MIPS", mips }, + }); + shader.setDescriptorCounts({ + { "outImage", mips }, + }); + + // atomic counter buffer + ::atomicCounterBloom.initialize( (const void*) nullptr, sizeof(::AtomicCounter) * 1, uf::renderer::enums::Buffer::STORAGE ); + shader.aliasBuffer("atomicCounterBloom", ::atomicCounterBloom); + + for ( auto& view : ::bloomViews ) ::destroyImageView( this, view ); + ::bloomViews.clear(); + ::bloomViews.resize(mips); + shader.textures.clear(); + + ext::vulkan::Texture2D source; source.aliasAttachment( this->getAttachment("bright") ); + for ( auto i = 0; i < mips; ++i ) { + auto& view = ::bloomViews[i]; + VkImageViewCreateInfo viewCreateInfo = {}; + viewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + viewCreateInfo.pNext = NULL; + viewCreateInfo.components = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, 
VK_COMPONENT_SWIZZLE_A }; + viewCreateInfo.subresourceRange.baseMipLevel = i; + viewCreateInfo.subresourceRange.layerCount = 1; + viewCreateInfo.subresourceRange.levelCount = 1; + viewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + viewCreateInfo.viewType = source.viewType; + viewCreateInfo.format = source.format; + viewCreateInfo.image = source.image; + + VK_CHECK_RESULT(vkCreateImageView(device.logicalDevice, &viewCreateInfo, nullptr, &view)); + VK_REGISTER_HANDLE(view); + + { + auto& texture = shader.textures.emplace_back(); + texture.aliasTexture( source ); + texture.view = view; + texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + texture.updateDescriptors(); + } + } } if ( settings::pipelines::culling ) { @@ -354,27 +403,31 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { auto& shader = blitter.material.getShader("compute", "depth-pyramid"); auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); + // depth pyramid + shader.aliasAttachment("depth", this); + shader.setSpecializationConstants({ { "MIPS", mips }, }); shader.setDescriptorCounts({ - { "inImage", mips }, { "outImage", mips }, }); - shader.aliasAttachment("depth", this); + // atomic counter buffer + ::atomicCounterDepth.initialize( (const void*) nullptr, sizeof(::AtomicCounter) * 1, uf::renderer::enums::Buffer::STORAGE ); + shader.aliasBuffer("atomicCounterDepth", ::atomicCounterDepth); - ext::vulkan::Texture2D source; source.aliasAttachment( this->getAttachment("depthPyramid") ); - source.sampler.descriptor.reduction.enabled = true; - source.sampler.descriptor.reduction.mode = VK_SAMPLER_REDUCTION_MODE_MIN; - - for ( auto& view : ::depthPyramidViews ) { - vkDestroyImageView(device.logicalDevice, view, nullptr); - VK_UNREGISTER_HANDLE(view); - } + for ( auto& view : ::depthPyramidViews ) ::destroyImageView( this, view ); ::depthPyramidViews.clear(); ::depthPyramidViews.resize(mips); shader.textures.clear(); + + 
storage.buffers.depthPyramid.destroy(true); + storage.buffers.depthPyramid.fromBuffers( NULL, 0, VK_FORMAT_R32_SFLOAT, width, height, 1, 1, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT, VK_IMAGE_LAYOUT_GENERAL ); + + ext::vulkan::Texture2D& source = storage.buffers.depthPyramid; + source.sampler.descriptor.reduction.enabled = true; + source.sampler.descriptor.reduction.mode = VK_SAMPLER_REDUCTION_MODE_MIN; for ( auto i = 0; i < mips; ++i ) { auto& view = ::depthPyramidViews[i]; @@ -392,22 +445,14 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { VK_CHECK_RESULT(vkCreateImageView(device.logicalDevice, &viewCreateInfo, nullptr, &view)); VK_REGISTER_HANDLE(view); - } - for ( auto i = 0; i < mips; ++i ) { - auto& texture = shader.textures.emplace_back(); - texture.aliasTexture( source ); - texture.view = ::depthPyramidViews[i]; - texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; - texture.updateDescriptors(); - } - - for ( auto i = 0; i < mips; ++i ) { - auto& texture = shader.textures.emplace_back(); - texture.aliasTexture( source ); - texture.view = ::depthPyramidViews[i]; - texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; - texture.updateDescriptors(); + { + auto& texture = shader.textures.emplace_back(); + texture.aliasTexture( source ); + texture.view = view; + texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + texture.updateDescriptors(); + } } } @@ -437,7 +482,18 @@ void ext::vulkan::DeferredRenderMode::initialize( Device& device ) { } if ( settings::pipelines::bloom ) { - descriptor.pipeline = "bloom"; + descriptor.aux = uf::vector::mips( pod::Vector2ui{ width, height } ); + descriptor.pipeline = "bloom-down"; + descriptor.subpass = 0; + descriptor.bind.point = VK_PIPELINE_BIND_POINT_COMPUTE; + if ( !blitter.hasPipeline( descriptor ) ) { + blitter.initializePipeline( descriptor ); + } + } + + if ( settings::pipelines::bloom ) { + descriptor.aux = {}; + descriptor.pipeline = "bloom-up"; descriptor.subpass = 0; descriptor.bind.point = 
VK_PIPELINE_BIND_POINT_COMPUTE; if ( !blitter.hasPipeline( descriptor ) ) { @@ -474,6 +530,49 @@ void ext::vulkan::DeferredRenderMode::tick() { rebuild = true; renderTarget.initialize( *renderTarget.device ); + if ( settings::pipelines::bloom ) { + auto& shader = blitter.material.getShader("compute", "bloom-down"); + auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); + shader.setSpecializationConstants({ + { "MIPS", mips }, + }); + shader.setDescriptorCounts({ + { "outImage", mips }, + }); + + for ( auto& view : ::bloomViews ) ::destroyImageView( this, view ); + ::bloomViews.clear(); + ::bloomViews.resize(mips); + shader.textures.clear(); + + ext::vulkan::Texture2D source; source.aliasAttachment( this->getAttachment("bright") ); + for ( auto i = 0; i < mips; ++i ) { + auto& view = ::bloomViews[i]; + VkImageViewCreateInfo viewCreateInfo = {}; + viewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + viewCreateInfo.pNext = NULL; + viewCreateInfo.components = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A }; + viewCreateInfo.subresourceRange.baseMipLevel = i; + viewCreateInfo.subresourceRange.layerCount = 1; + viewCreateInfo.subresourceRange.levelCount = 1; + viewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + viewCreateInfo.viewType = source.viewType; + viewCreateInfo.format = source.format; + viewCreateInfo.image = source.image; + + VK_CHECK_RESULT(vkCreateImageView(device->logicalDevice, &viewCreateInfo, nullptr, &view)); + VK_REGISTER_HANDLE(view); + + { + auto& texture = shader.textures.emplace_back(); + texture.aliasTexture( source ); + texture.view = view; + texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + texture.updateDescriptors(); + } + } + } + if ( settings::pipelines::culling ) { auto& shader = blitter.material.getShader("compute", "depth-pyramid"); auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); @@ -481,20 +580,17 @@ void 
ext::vulkan::DeferredRenderMode::tick() { { "MIPS", mips }, }); shader.setDescriptorCounts({ - { "inImage", mips }, { "outImage", mips }, }); - shader.aliasAttachment("depth", this); + storage.buffers.depthPyramid.destroy(true); + storage.buffers.depthPyramid.fromBuffers( NULL, 0, VK_FORMAT_R32_SFLOAT, width, height, 1, 1, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT, VK_IMAGE_LAYOUT_GENERAL ); - ext::vulkan::Texture2D source; source.aliasAttachment( this->getAttachment("depthPyramid") ); + ext::vulkan::Texture2D& source = storage.buffers.depthPyramid; source.sampler.descriptor.reduction.enabled = true; source.sampler.descriptor.reduction.mode = VK_SAMPLER_REDUCTION_MODE_MIN; - for ( auto& view : ::depthPyramidViews ) { - vkDestroyImageView(device->logicalDevice, view, nullptr); - VK_UNREGISTER_HANDLE(view); - } + for ( auto& view : ::depthPyramidViews ) ::destroyImageView( this, view ); ::depthPyramidViews.clear(); ::depthPyramidViews.resize(mips); shader.textures.clear(); @@ -515,23 +611,14 @@ void ext::vulkan::DeferredRenderMode::tick() { VK_CHECK_RESULT(vkCreateImageView(device->logicalDevice, &viewCreateInfo, nullptr, &view)); VK_REGISTER_HANDLE(view); - - } - for ( auto i = 0; i < mips; ++i ) { - auto& texture = shader.textures.emplace_back(); - texture.aliasTexture( source ); - texture.view = ::depthPyramidViews[i]; - texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; - texture.updateDescriptors(); - } - - for ( auto i = 0; i < mips; ++i ) { - auto& texture = shader.textures.emplace_back(); - texture.aliasTexture( source ); - texture.view = ::depthPyramidViews[i]; - texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; - texture.updateDescriptors(); + { + auto& texture = shader.textures.emplace_back(); + texture.aliasTexture( source ); + texture.view = view; + texture.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + texture.updateDescriptors(); + } } } } @@ -582,7 +669,20 @@ void ext::vulkan::DeferredRenderMode::tick() { } if ( settings::pipelines::bloom ) { - 
descriptor.pipeline = "bloom"; + descriptor.aux = uf::vector::mips( pod::Vector2ui{ width, height } ); + descriptor.pipeline = "bloom-down"; + descriptor.subpass = 0; + descriptor.bind.point = VK_PIPELINE_BIND_POINT_COMPUTE; + if ( blitter.hasPipeline( descriptor ) ) { + blitter.getPipeline( descriptor ).update( blitter, descriptor ); + } else { + blitter.initializePipeline( descriptor ); + } + } + + if ( settings::pipelines::bloom ) { + descriptor.aux = {}; + descriptor.pipeline = "bloom-up"; descriptor.subpass = 0; descriptor.bind.point = VK_PIPELINE_BIND_POINT_COMPUTE; if ( blitter.hasPipeline( descriptor ) ) { @@ -659,6 +759,21 @@ void ext::vulkan::DeferredRenderMode::render() { //unlockMutex( this->mostRecentCommandPoolId ); } void ext::vulkan::DeferredRenderMode::destroy() { + // cleanup + ::atomicCounterDepth.destroy(false); + ::atomicCounterBloom.destroy(false); + + for ( auto& view : ::bloomViews ) { + vkDestroyImageView(device->logicalDevice, view, nullptr); + VK_UNREGISTER_HANDLE(view); + } + ::bloomViews.clear(); + for ( auto& view : ::depthPyramidViews ) { + vkDestroyImageView(device->logicalDevice, view, nullptr); + VK_UNREGISTER_HANDLE(view); + } + ::depthPyramidViews.clear(); + ext::vulkan::RenderMode::destroy(); } @@ -744,18 +859,6 @@ void ext::vulkan::DeferredRenderMode::createCommandBuffers( const uf::stl::vecto size_t currentSubpass = 0; - /* - // transition layers for read - for ( auto layer : layers ) { - layer->pipelineBarrier( commandBuffer, 0 ); - } - */ - // VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL - // VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL - - // VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL - // VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL - #if 1 for ( auto& attachment : renderTarget.attachments ) { // transition attachments to general attachments for imageStore @@ -863,11 +966,61 @@ void ext::vulkan::DeferredRenderMode::createCommandBuffers( const uf::stl::vecto ::transitionAttachmentsFrom( this, shader, commandBuffer ); } - 
if ( settings::pipelines::bloom && blitter.material.hasShader("compute", "bloom") ) { - auto& shader = blitter.material.getShader("compute", "bloom"); + if ( settings::pipelines::bloom && blitter.material.hasShader("compute", "bloom-down") ) { + auto& shader = blitter.material.getShader("compute", "bloom-down"); + auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); + + uint32_t dispatchX = (width + 63) / 64; + uint32_t dispatchY = (height + 63) / 64; + uint32_t numWorkGroups = dispatchX * dispatchY; + auto& pushConstant = shader.pushConstants.front().get<::PushConstants>(); + pushConstant = { + .mips = mips, + .numWorkGroups = numWorkGroups, + .workGroupOffset = 0, + }; + ext::vulkan::GraphicDescriptor descriptor = blitter.descriptor; descriptor.renderMode = ""; - descriptor.pipeline = "bloom"; + descriptor.aux = mips; + descriptor.pipeline = "bloom-down"; + descriptor.bind.width = dispatchX * 256; + descriptor.bind.height = dispatchY; + descriptor.bind.depth = metadata.eyes; + descriptor.bind.point = VK_PIPELINE_BIND_POINT_COMPUTE; + descriptor.subpass = 0; + + // reset counter buffer + vkCmdFillBuffer(commandBuffer, ::atomicCounterBloom.buffer, 0, 4, 0); + VkMemoryBarrier counterBarrier{VK_STRUCTURE_TYPE_MEMORY_BARRIER}; + counterBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + counterBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &counterBarrier, 0, nullptr, 0, nullptr); + + // transition attachments to general attachments for imageStore + device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "setImageLayout" ); + ::transitionAttachmentsTo( this, shader, commandBuffer ); + + // dispatch compute shader + device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "bloom[down]" ); + blitter.record( commandBuffer, descriptor ); + + /* + ext::vulkan::Texture2D source; + 
source.aliasAttachment( this->getAttachment("bright") ); + source.generateMipmaps( commandBuffer ); + */ + + // transition attachments back to shader read layouts + device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "setImageLayout" ); + ::transitionAttachmentsFrom( this, shader, commandBuffer ); + } + + if ( settings::pipelines::bloom && blitter.material.hasShader("compute", "bloom-up") ) { + auto& shader = blitter.material.getShader("compute", "bloom-up"); + ext::vulkan::GraphicDescriptor descriptor = blitter.descriptor; + descriptor.renderMode = ""; + descriptor.pipeline = "bloom-up"; descriptor.bind.width = width; descriptor.bind.height = height; descriptor.bind.depth = metadata.eyes; @@ -879,20 +1032,8 @@ void ext::vulkan::DeferredRenderMode::createCommandBuffers( const uf::stl::vecto ::transitionAttachmentsTo( this, shader, commandBuffer ); // dispatch compute shader - auto& attachmentColor = this->getAttachment("color"); // color - auto& attachmentBright = this->getAttachment("bright"); // bloom - auto& attachmentScratch = this->getAttachment("scratch"); // pingpong - - device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "bloom[1]" ); - blitter.record( commandBuffer, descriptor, 0, 1 ); - cmdImageBarrier( commandBuffer, attachmentScratch.image, VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_GENERAL ); - - device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "bloom[2]" ); - blitter.record( commandBuffer, descriptor, 0, 2 ); - cmdImageBarrier( commandBuffer, attachmentBright.image, VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, VK_IMAGE_LAYOUT_GENERAL, VK_IMAGE_LAYOUT_GENERAL ); - - device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "bloom[3]" ); - blitter.record( commandBuffer, descriptor, 0, 3 ); + device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "bloom[up]" ); + blitter.record( commandBuffer, descriptor ); // 
transition attachments back to shader read layouts device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "setImageLayout" ); @@ -900,80 +1041,51 @@ void ext::vulkan::DeferredRenderMode::createCommandBuffers( const uf::stl::vecto } // construct depth-pyramid - #if 1 if ( settings::pipelines::culling && blitter.material.hasShader("compute", "depth-pyramid") ) { auto& shader = blitter.material.getShader("compute", "depth-pyramid"); auto mips = uf::vector::mips( pod::Vector2ui{ width, height } ); + uint32_t dispatchX = (width + 63) / 64; + uint32_t dispatchY = (height + 63) / 64; + uint32_t numWorkGroups = dispatchX * dispatchY; + auto& pushConstant = shader.pushConstants.front().get<::PushConstants>(); + pushConstant = { + .mips = mips, + .numWorkGroups = numWorkGroups, + .workGroupOffset = 0, + }; + ext::vulkan::GraphicDescriptor descriptor = blitter.descriptor; descriptor.renderMode = ""; descriptor.aux = mips; descriptor.pipeline = "depth-pyramid"; + descriptor.bind.width = dispatchX * 256; + descriptor.bind.height = dispatchY; descriptor.bind.depth = metadata.eyes; descriptor.bind.point = VK_PIPELINE_BIND_POINT_COMPUTE; descriptor.subpass = 0; - // dispatch compute shader - VkMemoryBarrier memoryBarrier{VK_STRUCTURE_TYPE_MEMORY_BARRIER}; - memoryBarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - memoryBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + // reset counter buffer + vkCmdFillBuffer(commandBuffer, ::atomicCounterDepth.buffer, 0, 4, 0); + VkMemoryBarrier counterBarrier{VK_STRUCTURE_TYPE_MEMORY_BARRIER}; + counterBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + counterBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &counterBarrier, 0, nullptr, 0, nullptr); device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "setImageLayout" ); ::transitionAttachmentsTo( this, shader, 
commandBuffer ); - for ( auto i = 0; i < mips; ++i ) { - // for some reason it dispatches at half the width without offsetting back... - descriptor.bind.width = std::max(1u, width >> (i - 1)); - descriptor.bind.height = std::max(1u, height >> (i - 1)); - - blitter.record(commandBuffer, descriptor, 0, i); - - vkCmdPipelineBarrier( commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_FLAGS_NONE, 1, &memoryBarrier, 0, NULL, 0, NULL ); - } + blitter.record(commandBuffer, descriptor); device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "setImageLayout" ); ::transitionAttachmentsFrom( this, shader, commandBuffer ); } - #endif + // post-renderpass commands VK_COMMAND_BUFFER_CALLBACK( CALLBACK_END, commandBuffer, frame, { device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "callback[end]" ); } ); - - #if 0 - if ( this->hasAttachment("depth") ) { - auto& attachment = this->getAttachment("depth"); - ext::vulkan::Texture texture; texture.aliasAttachment( attachment ); - texture.width = width; - texture.height = height; - texture.depth = 1; - - texture.imageLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL; - texture.descriptor.imageLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL; - #if 1 - imageMemoryBarrier.subresourceRange.layerCount = metadata.eyes; - imageMemoryBarrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; - uf::renderer::Texture::setImageLayout( commandBuffer, attachment.image, VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, imageMemoryBarrier.subresourceRange ); - #endif - - for ( size_t eye = 0; eye < metadata.eyes; ++eye ) { - texture.generateMipmaps(commandBuffer, eye); - } - - #if 1 - uf::renderer::Texture::setImageLayout( commandBuffer, attachment.image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL, imageMemoryBarrier.subresourceRange ); - 
imageMemoryBarrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - imageMemoryBarrier.subresourceRange.layerCount = 1; - #endif - } #endif - #endif - - /* - for ( auto layer : layers ) { - layer->pipelineBarrier( commandBuffer, 1 ); - } - */ } device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::END, "end" ); diff --git a/engine/src/ext/vulkan/rendermodes/transition.inl b/engine/src/ext/vulkan/rendermodes/transition.inl index 3cc15643..90d8feae 100644 --- a/engine/src/ext/vulkan/rendermodes/transition.inl +++ b/engine/src/ext/vulkan/rendermodes/transition.inl @@ -9,24 +9,41 @@ namespace { subresourceRange.baseMipLevel = 0; subresourceRange.levelCount = 1; subresourceRange.baseArrayLayer = 0; - subresourceRange.layerCount = 1; - subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; subresourceRange.layerCount = self->metadata.eyes; + subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; for ( auto& descriptor : shader.metadata.aliases.attachments ) { if ( descriptor.layout == VK_IMAGE_LAYOUT_UNDEFINED ) continue; VkImage image = VK_NULL_HANDLE; + VkImageLayout initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + size_t mips = 1; + if ( descriptor.renderMode ) { - if ( descriptor.renderMode->hasAttachment(descriptor.name) ) - image = descriptor.renderMode->getAttachment(descriptor.name).image; + if ( descriptor.renderMode->hasAttachment(descriptor.name) ) { + auto& attachment = descriptor.renderMode->getAttachment(descriptor.name); + image = attachment.image; + mips = attachment.descriptor.mips; + initialLayout = attachment.descriptor.layout; + } } else if ( self->hasAttachment(descriptor.name) ) { - if ( self->hasAttachment(descriptor.name) ) - image = self->getAttachment(descriptor.name).image; + if ( self->hasAttachment(descriptor.name) ) { + auto& attachment = self->getAttachment(descriptor.name); + image = attachment.image; + mips = attachment.descriptor.mips; + initialLayout = attachment.descriptor.layout; + } } if ( image == VK_NULL_HANDLE ) 
continue; + subresourceRange.baseMipLevel = 0; + subresourceRange.levelCount = 1; subresourceRange.aspectMask = descriptor.name == "depth" ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT; uf::renderer::Texture::setImageLayout( commandBuffer, image, layout, descriptor.layout, subresourceRange ); + if ( mips > 1 ) { + subresourceRange.baseMipLevel = 1; + subresourceRange.levelCount = VK_REMAINING_MIP_LEVELS; + uf::renderer::Texture::setImageLayout( commandBuffer, image, initialLayout, descriptor.layout, subresourceRange ); + } } } void transitionAttachmentsFrom( @@ -39,24 +56,41 @@ namespace { subresourceRange.baseMipLevel = 0; subresourceRange.levelCount = 1; subresourceRange.baseArrayLayer = 0; - subresourceRange.layerCount = 1; - subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; subresourceRange.layerCount = self->metadata.eyes; + subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; for ( auto& descriptor : shader.metadata.aliases.attachments ) { if ( descriptor.layout == VK_IMAGE_LAYOUT_UNDEFINED ) continue; VkImage image = VK_NULL_HANDLE; + VkImageLayout initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + size_t mips = 1; + if ( descriptor.renderMode ) { - if ( descriptor.renderMode->hasAttachment(descriptor.name) ) - image = descriptor.renderMode->getAttachment(descriptor.name).image; + if ( descriptor.renderMode->hasAttachment(descriptor.name) ) { + auto& attachment = descriptor.renderMode->getAttachment(descriptor.name); + image = attachment.image; + mips = attachment.descriptor.mips; + initialLayout = attachment.descriptor.layout; + } } else if ( self->hasAttachment(descriptor.name) ) { - if ( self->hasAttachment(descriptor.name) ) - image = self->getAttachment(descriptor.name).image; + if ( self->hasAttachment(descriptor.name) ) { + auto& attachment = self->getAttachment(descriptor.name); + image = attachment.image; + mips = attachment.descriptor.mips; + initialLayout = attachment.descriptor.layout; + } } if ( image == VK_NULL_HANDLE ) continue; 
+ subresourceRange.baseMipLevel = 0; + subresourceRange.levelCount = 1; subresourceRange.aspectMask = descriptor.name == "depth" ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT; uf::renderer::Texture::setImageLayout( commandBuffer, image, descriptor.layout, layout, subresourceRange ); + if ( mips > 1 ) { + subresourceRange.baseMipLevel = 1; + subresourceRange.levelCount = VK_REMAINING_MIP_LEVELS; + uf::renderer::Texture::setImageLayout( commandBuffer, image, descriptor.layout, initialLayout, subresourceRange ); + } } } } \ No newline at end of file diff --git a/engine/src/ext/vulkan/shader.cpp b/engine/src/ext/vulkan/shader.cpp index fedc8af2..fc42393a 100644 --- a/engine/src/ext/vulkan/shader.cpp +++ b/engine/src/ext/vulkan/shader.cpp @@ -12,8 +12,8 @@ #include #include -#define VK_DEBUG_VALIDATION_MESSAGE(x)\ -// VK_VALIDATION_MESSAGE(x); +#define VK_DEBUG_VALIDATION_MESSAGE(...)\ + //VK_VALIDATION_MESSAGE(__VA_ARGS__); #define UF_SHADER_PARSE_AS_JSON 0 #if UF_SHADER_PARSE_AS_JSON @@ -94,7 +94,7 @@ ext::vulkan::userdata_t ext::vulkan::jsonToUserdata( const ext::json::Value& pay #if UF_SHADER_TRACK_NAMES uf::stl::string path = uf::string::join(variableName, "."); path = uf::string::replace( path, ".[", "[" ); - VK_VALIDATION_MESSAGE("[" << (byteBuffer - byteBufferStart) << " / "<< (byteBufferEnd - byteBuffer) <<"]\tInserting: " << path << " = " << value.dump()); + //VK_VALIDATION_MESSAGE("[" << (byteBuffer - byteBufferStart) << " / "<< (byteBufferEnd - byteBuffer) <<"]\tInserting: " << path << " = " << value.dump()); #endif // is strictly an int if ( value.is(true) ) { @@ -120,7 +120,7 @@ ext::vulkan::userdata_t ext::vulkan::jsonToUserdata( const ext::json::Value& pay #endif }; #if UF_SHADER_TRACK_NAMES - VK_VALIDATION_MESSAGE("Updating {} in {}", name, filename); + //VK_VALIDATION_MESSAGE("Updating {} in {}", name, filename); // VK_VALIDATION_MESSAGE("Iterator: " << (void*) byteBuffer << "\t" << (void*) byteBufferEnd << "\t" << (byteBufferEnd - 
byteBuffer)); #endif parse(payload); @@ -264,7 +264,7 @@ ext::vulkan::userdata_t ext::vulkan::jsonToUserdata( const ext::json::Value& pay #if UF_SHADER_TRACK_NAMES uf::stl::string path = uf::string::join(variableName, "."); path = uf::string::replace( path, ".[", "[" ); - VK_VALIDATION_MESSAGE("[" << (byteBuffer - byteBufferStart) << " / "<< (byteBufferEnd - byteBuffer) <<"]\tInserting: " << path << " = (" << primitive << ") " << input.dump()); + //VK_VALIDATION_MESSAGE("[" << (byteBuffer - byteBufferStart) << " / "<< (byteBufferEnd - byteBuffer) <<"]\tInserting: " << path << " = (" << primitive << ") " << input.dump()); #endif pushValue( primitive, input ); } @@ -275,12 +275,12 @@ ext::vulkan::userdata_t ext::vulkan::jsonToUserdata( const ext::json::Value& pay }; auto& definitions = metadata.json["definitions"]["uniforms"][name]; #if UF_SHADER_TRACK_NAMES - VK_VALIDATION_MESSAGE("Updating " << name << " in " << filename); - VK_VALIDATION_MESSAGE("Iterator: " << (void*) byteBuffer << "\t" << (void*) byteBufferEnd << "\t" << (byteBufferEnd - byteBuffer)); + //VK_VALIDATION_MESSAGE("Updating " << name << " in " << filename); + //VK_VALIDATION_MESSAGE("Iterator: " << (void*) byteBuffer << "\t" << (void*) byteBufferEnd << "\t" << (byteBufferEnd - byteBuffer)); #endif parseDefinition(payload, definitions); #if UF_SHADER_TRACK_NAMES - VK_VALIDATION_MESSAGE("Iterator: " << (void*) byteBuffer << "\t" << (void*) byteBufferEnd << "\t" << (byteBufferEnd - byteBuffer)); + //VK_VALIDATION_MESSAGE("Iterator: " << (void*) byteBuffer << "\t" << (void*) byteBufferEnd << "\t" << (byteBufferEnd - byteBuffer)); #endif #endif return userdata; @@ -489,14 +489,14 @@ void ext::vulkan::Shader::initialize( ext::vulkan::Device& device, const uf::stl size_t bufferSize = comp.get_declared_struct_size(base_type); if ( bufferSize <= 0 ) break; if ( bufferSize > device.properties.limits.maxUniformBufferRange ) { - VK_DEBUG_VALIDATION_MESSAGE("Invalid uniform buffer length of " << bufferSize << " 
for shader " << filename); + VK_DEBUG_VALIDATION_MESSAGE("Invalid uniform buffer length of {} for shader {}", bufferSize, filename); bufferSize = device.properties.limits.maxUniformBufferRange; } bufferSize = ALIGNED_SIZE( bufferSize, device.properties.limits.minUniformBufferOffsetAlignment ); { - VK_DEBUG_VALIDATION_MESSAGE("Uniform size of " << bufferSize << " for shader " << filename); + VK_DEBUG_VALIDATION_MESSAGE("Uniform size of {} for shader {}", bufferSize, filename); // auto& uniform = uniforms.emplace_back(); // uniform.create( bufferSize ); } @@ -564,7 +564,7 @@ void ext::vulkan::Shader::initialize( ext::vulkan::Device& device, const uf::stl #define LOOP_RESOURCES( key, type ) for ( size_t i = 0; i < res.key.size(); ++i ) {\ const auto& resource = res.key[i];\ - VK_DEBUG_VALIDATION_MESSAGE("["< device.properties.limits.maxPushConstantsSize ) { - VK_DEBUG_VALIDATION_MESSAGE("Invalid push constant length of " << size << " for shader " << filename); + VK_DEBUG_VALIDATION_MESSAGE("Invalid push constant length of {} for shader {}", size, filename); + //VK_DEBUG_VALIDATION_MESSAGE("Invalid push constant length of " << size << " for shader " << filename); size = device.properties.limits.maxPushConstantsSize; } - VK_DEBUG_VALIDATION_MESSAGE("Push constant size of " << size << " for shader " << filename); + VK_DEBUG_VALIDATION_MESSAGE("Push constant size of {} for shader {},", size, filename); { auto& pushConstant = pushConstants.emplace_back(); pushConstant.create( size ); @@ -724,7 +725,7 @@ void ext::vulkan::Shader::initialize( ext::vulkan::Device& device, const uf::stl specializationMapEntries.emplace_back(specializationMapEntry); } specializationConstants.create( specializationSize ); - VK_DEBUG_VALIDATION_MESSAGE("Specialization constants size of " << specializationSize << " for shader " << filename); + VK_DEBUG_VALIDATION_MESSAGE("Specialization constants size of {} for shader {}", specializationSize, filename); uint8_t* s = (uint8_t*) (void*) 
specializationConstants; size_t offset = 0; @@ -798,7 +799,7 @@ void ext::vulkan::Shader::initialize( ext::vulkan::Device& device, const uf::stl definition.validate = false; } break; default: { - VK_DEBUG_VALIDATION_MESSAGE("Unregistered specialization constant type at offset " << offset << " for shader " << filename ); + VK_DEBUG_VALIDATION_MESSAGE("Unregistered specialization constant type at offset {} for shader {}", offset, filename ); } break; } #if UF_SHADER_PARSE_AS_JSON @@ -806,7 +807,7 @@ void ext::vulkan::Shader::initialize( ext::vulkan::Device& device, const uf::stl member["size"] = size; member["default"] = member["value"]; metadata.json["specializationConstants"].emplace_back(member); - VK_DEBUG_VALIDATION_MESSAGE("Specialization constant: " << member["type"].as() << " " << name << " = " << member["value"].dump() << "; at offset " << offset << " for shader " << filename ); + //VK_DEBUG_VALIDATION_MESSAGE("Specialization constant: " << member["type"].as() << " " << name << " = " << member["value"].dump() << "; at offset " << offset << " for shader " << filename ); #endif memcpy( &s[offset], &buffer, size ); @@ -859,7 +860,7 @@ bool ext::vulkan::Shader::validate() { if ( it == uniforms.end() ) break; auto& uniform = *(it++); if ( uniform.data().len != buffer.allocationInfo.size ) { - VK_DEBUG_VALIDATION_MESSAGE("Uniform size mismatch: Expected " << buffer.allocationInfo.size << ", got " << uniform.data().len << "; fixing..."); + VK_DEBUG_VALIDATION_MESSAGE("Uniform size mismatch: Expected {}, got {}; fixing...", buffer.allocationInfo.size, uniform.data().len); uniform.destroy(); uniform.create(buffer.allocationInfo.size); valid = false; diff --git a/engine/src/ext/vulkan/texture.cpp b/engine/src/ext/vulkan/texture.cpp index 28ff0ba7..94c2963d 100644 --- a/engine/src/ext/vulkan/texture.cpp +++ b/engine/src/ext/vulkan/texture.cpp @@ -205,12 +205,15 @@ void ext::vulkan::Texture::destroy( bool defer ) { if ( !device || !device->logicalDevice || aliased ) 
return; // device->logicalDevice should never be null, but it happens, somehow if ( defer ) { - ext::vulkan::gc::textures.emplace_back( *this ); + ext::vulkan::mutex.lock(); + device->transient.textures.emplace_back(*this); + ext::vulkan::mutex.unlock(); return; } if ( view != VK_NULL_HANDLE ) { vkDestroyImageView(device->logicalDevice, view, nullptr); + VK_UNREGISTER_HANDLE( view ); view = VK_NULL_HANDLE; } if ( image != VK_NULL_HANDLE ) { @@ -581,6 +584,7 @@ void ext::vulkan::Texture::fromBuffers( viewCreateInfo.subresourceRange.levelCount = this->mips; viewCreateInfo.image = image; VK_CHECK_RESULT(vkCreateImageView(device.logicalDevice, &viewCreateInfo, nullptr, &view)); + VK_REGISTER_HANDLE( view ); { auto commandBuffer = device.fetchCommandBuffer(uf::renderer::QueueEnum::GRAPHICS); @@ -890,7 +894,7 @@ void ext::vulkan::Texture::generateMipmaps( VkCommandBuffer commandBuffer, uint3 int32_t mipWidth = width; int32_t mipHeight = height; - int32_t mipDepth = depth; + int32_t mipDepth = MAX(1, depth); for ( size_t i = 1; i < this->mips; ++i ) { // transition previous layer to read from it barrier.subresourceRange.baseMipLevel = i - 1; @@ -1150,7 +1154,7 @@ uf::Image ext::vulkan::Texture3D::screenshot( uint32_t layerID ) { imageCopy.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; imageCopy.dstSubresource.baseArrayLayer = 0; imageCopy.dstSubresource.layerCount = 1; - imageCopy.dstOffset = { 0, 0, 0 }; + imageCopy.dstOffset = { 0, 0, layerID }; imageCopy.extent = { this->width, this->height, 1 }; device->UF_CHECKPOINT_MARK( commandBuffer, pod::Checkpoint::GENERIC, "copyImage" ); diff --git a/engine/src/ext/vulkan/vulkan.cpp b/engine/src/ext/vulkan/vulkan.cpp index 52640966..6448be5e 100644 --- a/engine/src/ext/vulkan/vulkan.cpp +++ b/engine/src/ext/vulkan/vulkan.cpp @@ -116,8 +116,6 @@ uint32_t ext::vulkan::states::frameAccumulate = 0; bool ext::vulkan::states::frameAccumulateReset = false; uint32_t ext::vulkan::states::frameSkip = 0; -uf::stl::vector 
ext::vulkan::gc::textures; - uf::ThreadUnique ext::vulkan::currentRenderMode; ext::vulkan::Buffer ext::vulkan::scratchBuffer; @@ -495,6 +493,7 @@ void ext::vulkan::initialize( bool soft ) { void ext::vulkan::tick() { // ext::vulkan::mutex.lock(); if ( ext::vulkan::states::resized || ext::vulkan::settings::experimental::rebuildOnTickBegin ) { + synchronize(0b11); ext::vulkan::states::rebuild = true; ::skip = true; } @@ -534,13 +533,6 @@ void ext::vulkan::tick() { uf::thread::execute( tasks ); -/* - for ( auto& texture : ext::vulkan::gc::textures ) { - texture.destroy( false ); - } - ext::vulkan::gc::textures.clear(); -*/ - if ( ext::vulkan::states::rebuild && ext::vulkan::settings::experimental::skipRenderOnRebuild ) ::skip = true; ext::vulkan::states::rebuild = false; @@ -666,6 +658,9 @@ void ext::vulkan::render() { for ( auto& buffer : transient.buffers ) buffer.destroy(false); transient.buffers.clear(); + for ( auto& texture : transient.textures ) texture.destroy(false); + transient.textures.clear(); + for ( auto& as : transient.ass ) { uf::renderer::vkDestroyAccelerationStructureKHR(device, as.handle, nullptr); VK_UNREGISTER_HANDLE( as.handle ); @@ -677,7 +672,7 @@ void ext::vulkan::destroy( bool soft ) { ext::vulkan::flushCommandBuffers(); // ext::vulkan::mutex.lock(); - synchronize(); + synchronize(0b11); #if UF_USE_FFX_FSR if ( settings::pipelines::fsr ) { @@ -725,7 +720,7 @@ void ext::vulkan::destroy( bool soft ) { // ext::vulkan::mutex.unlock(); // check for any leaked resources - if ( false ) { + if ( ext::vulkan::settings::validation::checkpoints ) { UF_MSG_DEBUG("Leaked resources:"); for ( auto& resource : ext::vulkan::Resource::handles ) {