diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 044cb9f..dfe5c49 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -59,6 +59,10 @@ package_sample: - "media/cauldron-media/color_ramp_bt2020_dcip3/" - "media/cauldron-media/readme.md" - "media/cauldron-media/screenshot.png" + - "media/atlas.dds" + - "media/checkerboard.dds" + - "media/composition_text.dds" + - "media/lion.jpg" - "README.md" - "LICENSE.txt" - "%SampleName%_DX12.bat" diff --git a/CMakeLists.txt b/CMakeLists.txt index aacf16f..9a5424c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ cmake_minimum_required(VERSION 3.12.1) option (GFX_API_DX12 "Build with DX12" ON) +option (GFX_API_VK "Build with Vulkan" ON) if(NOT DEFINED GFX_API) project (FSR2_Sample) diff --git a/LICENSE.txt b/LICENSE.txt index 699b8a3..19b21ff 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,3 +1,5 @@ +FidelityFX Super Resolution 2.1 +================================= Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/README.md b/README.md index e1b64dc..9f43d44 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# FidelityFX Super Resolution 2.0.1 (FSR 2.0) +# FidelityFX Super Resolution 2.1 (FSR 2.1) Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. @@ -21,11 +21,11 @@ THE SOFTWARE. ![Screenshot](screenshot.png) -AMD FidelityFX Super Resolution 2.0 (FSR 2) is an open source, high-quality solution for producing high resolution frames from lower resolution inputs. +AMD FidelityFX Super Resolution 2 (FSR 2) is an open source, high-quality solution for producing high resolution frames from lower resolution inputs. You can find the binaries for FidelityFX FSR in the release section on GitHub. -# Super Resolution 2.0 +# Super Resolution 2 ### Table of contents @@ -50,6 +50,7 @@ You can find the binaries for FidelityFX FSR in the release section on GitHub. - [Camera jitter](#camera-jitter) - [Camera jump cuts](#camera-jump-cuts) - [Mipmap biasing](#mipmap-biasing) + - [Frame Time Delta Input](#frame-time-delta-input) - [HDR support](#hdr-support) - [Falling back to 32bit floating point](#falling-back-to-32bit-floating-point) - [64-wide wavefronts](#64-wide-wavefronts) @@ -63,12 +64,12 @@ You can find the binaries for FidelityFX FSR in the release section on GitHub. - [Reproject & accumulate](#reproject-accumulate) - [Robust Contrast Adaptive Sharpening (RCAS)](#robust-contrast-adaptive-sharpening-rcas) - [Building the sample](#building-the-sample) +- [Limitations](#limitations) - [Version history](#version-history) -- [Limitations](release_notes.txt) - [References](#references) # Introduction -**FidelityFX Super Resolution 2.0** (or **FSR2** for short) is a cutting-edge upscaling technique developed from the ground up to produce high resolution frames from lower resolution inputs. +**FidelityFX Super Resolution 2** (or **FSR2** for short) is a cutting-edge upscaling technique developed from the ground up to produce high resolution frames from lower resolution inputs. ![alt text](docs/media/super-resolution-temporal/overview.svg "A diagram showing the input resources to the super resolution (temporal) algorithm.") @@ -100,19 +101,19 @@ To use FSR2 you should follow the steps below: 8. Create a backend for your target API. E.g. for DirectX12 you should call [`ffxFsr2GetInterfaceDX12`](src/ffx-fsr2-api/dx12/ffx_fsr2_dx12.h#L55). 
A scratch buffer should be allocated of the size returned by calling [`ffxFsr2GetScratchMemorySizeDX12`](src/ffx-fsr2-api/dx12/ffx_fsr2_dx12.h#L40) and the pointer to that buffer passed to [`ffxFsr2GetInterfaceDX12`](src/ffx-fsr2-api/dx12/ffx_fsr2_dx12.h#L55). -9. Create a FSR2 context by calling [`ffxFsr2ContextCreate`](src/ffx-fsr2-api/ffx_fsr2.h#L213). The parameters structure should be filled out matching the configuration of your application. See the API reference documentation for more details. +9. Create a FSR2 context by calling [`ffxFsr2ContextCreate`](src/ffx-fsr2-api/ffx_fsr2.h#L215). The parameters structure should be filled out matching the configuration of your application. See the API reference documentation for more details. -10. Each frame you should call [`ffxFsr2ContextDispatch`](src/ffx-fsr2-api/ffx_fsr2.h#L254) to launch FSR2 workloads. The parameters structure should be filled out matching the configuration of your application. See the API reference documentation for more details. +10. Each frame you should call [`ffxFsr2ContextDispatch`](src/ffx-fsr2-api/ffx_fsr2.h#L256) to launch FSR2 workloads. The parameters structure should be filled out matching the configuration of your application. See the API reference documentation for more details, and ensure the [`frameTimeDelta` field is provided in milliseconds](#frame-time-delta-input). -11. When your application is terminating (or you wish to destroy the context for another reason) you should call [`ffxFsr2ContextDestroy`](src/ffx-fsr2-api/ffx_fsr2.h#L277). The GPU should be idle before calling this function. +11. When your application is terminating (or you wish to destroy the context for another reason) you should call [`ffxFsr2ContextDestroy`](src/ffx-fsr2-api/ffx_fsr2.h#L279). The GPU should be idle before calling this function. -12. Sub-pixel jittering should be applied to your application's projection matrix. This should be done when performing the main rendering of your application. You should use the [`ffxFsr2GetJitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L422) function to compute the precise jitter offsets. See [Camera jitter](#camera-jitter) section for more details. +12. Sub-pixel jittering should be applied to your application's projection matrix. This should be done when performing the main rendering of your application. You should use the [`ffxFsr2GetJitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L424) function to compute the precise jitter offsets. See [Camera jitter](#camera-jitter) section for more details. -13. For the best upscaling quality it is strongly advised that you populate the [Reactive mask](#reactive-mask) and [Transparency & composition mask](#transparency-and-composition-mask) according to our guidelines. You can also use [`ffxFsr2ContextGenerateReactiveMask`](src/ffx-fsr2-api/ffx_fsr2.h#L265) as a starting point. +13. For the best upscaling quality it is strongly advised that you populate the [Reactive mask](#reactive-mask) and [Transparency & composition mask](#transparency-and-composition-mask) according to our guidelines. You can also use [`ffxFsr2ContextGenerateReactiveMask`](src/ffx-fsr2-api/ffx_fsr2.h#L267) as a starting point. 14. Applications should expose [scaling modes](#scaling-modes), in their user interface in the following order: Quality, Balanced, Performance, and (optionally) Ultra Performance. -15. Applications should also expose a sharpening slider to allow end users to acheive additional quality. +15. 
Applications should also expose a sharpening slider to allow end users to achieve additional quality. # Integration guidelines @@ -131,24 +132,24 @@ We strongly recommend that applications adopt consistent naming and scaling rati ## Performance Depending on your target hardware and operating configuration FSR2 will operate at different performance levels. -The table below summarizes the measured performance of FSR2 on a variety of hardware. +The table below summarizes the measured performance of FSR2 on a variety of hardware in DX12. | Target resolution | Quality | RX 6950 XT | RX 6900 XT | RX 6800 XT | RX 6800 | RX 6700 XT | RX 6600 XT | RX 5700 XT | RX Vega 56 | RX 590 | |-------------------|------------------|------------|------------|------------|---------|------------|------------|------------|------------|--------| -| 3840x2160 | Quality (1.5x) | 1.1ms | 1.2ms | 1.3ms | 1.6ms | 1.8ms | 3.0ms | 2.4ms | 3.7ms | 5.6ms | -| | Balanced (1.7x) | 1.0ms | 1.1ms | 1.1ms | 1.4ms | 1.7ms | 2.7ms | 2.2ms | 3.3ms | 5.3ms | -| | Performance (2x) | 0.9ms | 1.0ms | 1.0ms | 1.4ms | 1.5ms | 2.3ms | 2.0ms | 3.1ms | 4.9ms | -| | Ultra perf. (3x) | 0.8ms | 0.9ms | 0.9ms | 1.2ms | 1.4ms | 1.8ms | 1.7ms | 2.7ms | 4.3ms | -| 2560x1440 | Quality (1.5x) | 0.5ms | 0.5ms | 0.5ms | 0.7ms | 0.8ms | 1.2ms | 1.0ms | 1.6ms | 2.5ms | -| | Balanced (1.7x) | 0.4ms | 0.5ms | 0.5ms | 0.6ms | 0.8ms | 1.0ms | 1.0ms | 1.5ms | 2.4ms | -| | Performance (2x) | 0.4ms | 0.4ms | 0.4ms | 0.5ms | 0.7ms | 0.9ms | 0.9ms | 1.4ms | 2.2ms | -| | Ultra perf. (3x) | 0.3ms | 0.4ms | 0.4ms | 0.5ms | 0.6ms | 0.8ms | 0.7ms | 1.2ms | 1.9ms | -| 1920x1080 | Quality (1.5x) | 0.3ms | 0.3ms | 0.3ms | 0.3ms | 0.5ms | 0.6ms | 0.6ms | 0.9ms | 1.4ms | -| | Balanced (1.7x) | 0.2ms | 0.2ms | 0.3ms | 0.3ms | 0.4ms | 0.6ms | 0.5ms | 0.8ms | 1.3ms | -| | Performance (2x) | 0.2ms | 0.2ms | 0.2ms | 0.3ms | 0.4ms | 0.5ms | 0.5ms | 0.8ms | 1.3ms | -| | Ultra perf. (3x) | 0.2ms | 0.2ms | 0.2ms | 0.3ms | 0.4ms | 0.4ms | 0.4ms | 0.7ms | 1.1ms | +| 3840x2160 | Quality (1.5x) | 1.1ms | 1.2ms | 1.2ms | 1.3ms | 1.8ms | 3.0ms | 2.4ms | 4.8ms | 5.3ms | +| | Balanced (1.7x) | 1.0ms | 1.0ms | 1.1ms | 1.2ms | 1.6ms | 2.7ms | 2.1ms | 4.3ms | 4.8ms | +| | Performance (2x) | 0.8ms | 0.9ms | 0.9ms | 1.1ms | 1.5ms | 2.3ms | 1.9ms | 3.5ms | 4.2ms | +| | Ultra perf. (3x) | 0.7ms | 0.7ms | 0.7ms | 1.0ms | 1.3ms | 1.7ms | 1.6ms | 2.8ms | 3.5ms | +| 2560x1440 | Quality (1.5x) | 0.4ms | 0.4ms | 0.5ms | 0.6ms | 0.8ms | 1.2ms | 1.0ms | 1.8ms | 2.3ms | +| | Balanced (1.7x) | 0.4ms | 0.4ms | 0.4ms | 0.5ms | 0.7ms | 1.0ms | 0.9ms | 1.7ms | 2.1ms | +| | Performance (2x) | 0.4ms | 0.4ms | 0.4ms | 0.5ms | 0.7ms | 0.9ms | 0.8ms | 1.4ms | 1.9ms | +| | Ultra perf. (3x) | 0.3ms | 0.3ms | 0.3ms | 0.4ms | 0.6ms | 0.7ms | 0.7ms | 1.2ms | 1.6ms | +| 1920x1080 | Quality (1.5x) | 0.3ms | 0.3ms | 0.3ms | 0.3ms | 0.4ms | 0.6ms | 0.6ms | 1.0ms | 1.3ms | +| | Balanced (1.7x) | 0.2ms | 0.2ms | 0.2ms | 0.3ms | 0.4ms | 0.6ms | 0.5ms | 0.9ms | 1.2ms | +| | Performance (2x) | 0.2ms | 0.2ms | 0.2ms | 0.3ms | 0.4ms | 0.5ms | 0.5ms | 0.8ms | 1.1ms | +| | Ultra perf. (3x) | 0.2ms | 0.2ms | 0.2ms | 0.2ms | 0.3ms | 0.4ms | 0.4ms | 0.7ms | 0.9ms | -Figures are rounded to the nearest 0.1ms and are without [`enableSharpening`](src/ffx-fsr2-api/ffx_fsr2.h#L127) set. +Figures are rounded to the nearest 0.1ms and are without additional [`sharpness`](src/ffx-fsr2-api/ffx_fsr2.h#L129). ## Memory requirements Using FSR2 requires some additional GPU local memory to be allocated for consumption by the GPU. 
When using the FSR2 API, this memory is allocated when the FSR2 context is created, and is done so via the series of callbacks which comprise the backend interface. This memory is used to store intermediate surfaces which are computed by the FSR2 algorithm as well as surfaces which are persistent across many frames of the application. The table below includes the amount of memory used by FSR2 under various operating conditions. The "Working set" column indicates the total amount of memory used by FSR2 as the algorithm is executing on the GPU; this is the amount of memory FSR2 will require to run. The "Persistent memory" column indicates how much of the "Working set" column is required to be left intact for subsequent frames of the application; this memory stores the temporal data consumed by FSR2. The "Aliasable memory" column indicates how much of the "Working set" column may be aliased by surfaces or other resources used by the application outside of the operating boundaries of FSR2. @@ -157,41 +158,43 @@ You can take control of resource creation in FSR2 by overriding the resource cre | Resolution | Quality | Working set (MB) | Persistent memory (MB) | Aliasable memory (MB) | | -----------|------------------------|------------------|------------------------|-------------------------| -| 3840x2160 | Quality (1.5x) | 293.53MB | 94.92MB | 198.61MB | -| | Balanced (1.7x) | 274.03MB | 94.92MB | 179.11MB | -| | Performance (2x) | 255.68MB | 94.92MB | 160.76MB | -| | Ultra performance (3x) | 227.11MB | 94.92MB | 132.19MB | -| 2560x1440 | Quality (1.5x) | 136.41MB | 84.37MB | 52.04MB | -| | Balanced (1.7x) | 126.97MB | 84.37MB | 42.60MB | -| | Performance (2x) | 117.53MB | 84.37MB | 33.16MB | -| | Ultra performance (3x) | 104.95MB | 84.37MB | 20.58MB | -| 1920x1080 | Quality (1.5x) | 76.46MB | 47.46MB | 29.18MB | -| | Balanced (1.7x) | 71.75MB | 47.46MB | 23.68MB | -| | Performance (2x) | 67.81MB | 47.46MB | 20.79MB | -| | Ultra performance (3x) | 58.38MB | 47.46MB | 11.09MB | +| 3840x2160 | Quality (1.5x) | 302MB | 218MB | 85MB | +| | Balanced (1.7x) | 279MB | 214MB | 65MB | +| | Performance (2x) | 260MB | 211MB | 49MB | +| | Ultra performance (3x) | 228MB | 206MB | 22MB | +| 2560x1440 | Quality (1.5x) | 140MB | 100MB | 40MB | +| | Balanced (1.7x) | 129MB | 98MB | 33MB | +| | Performance (2x) | 119MB | 97MB | 24MB | +| | Ultra performance (3x) | 105MB | 95MB | 10MB | +| 1920x1080 | Quality (1.5x) | 78MB | 56MB | 22MB | +| | Balanced (1.7x) | 73MB | 55MB | 18MB | +| | Performance (2x) | 69MB | 54MB | 15MB | +| | Ultra performance (3x) | 59MB | 53MB | 6MB | + +Figures are rounded up to nearest MB and are without additional [`sharpness`](src/ffx-fsr2-api/ffx_fsr2.h#L129). Figures are approximations using an RX 6700XT GPU in DX12 and are subject to change. For details on how to manage FSR2's memory requirements please refer to the section of this document dealing with [Memory management](#memory-management). ## Input resources FSR2 is a temporal algorithm, and therefore requires access to data from both the current and previous frame. The following table enumerates all external inputs required by FSR2. -> The resolution column indicates if the data should be at 'rendered' resolution or 'presentation' resolution. 'Rendered' resolution indicates that the resource should match the resolution at which the application is performing its rendering. Conversely, 'presentation' indicates that the resolution of the target should match that which is to be presented to the user. 
All resources are from the current rendered frame, for DirectX(R)12 and Vulkan(R) applications all input resources should be transitioned to [`D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE`](https://docs.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_resource_states) and [`VK_ACCESS_SHADER_READ_BIT`](https://www.khronos.org/registry/vulkan/specs/1.3-extensions/man/html/VkAccessFlagBits.html) respectively before calling [`ffxFsr2ContextDispatch`](src/ffx-fsr2-api/ffx_fsr2.h#L254). +> The resolution column indicates if the data should be at 'rendered' resolution or 'presentation' resolution. 'Rendered' resolution indicates that the resource should match the resolution at which the application is performing its rendering. Conversely, 'presentation' indicates that the resolution of the target should match that which is to be presented to the user. All resources are from the current rendered frame, for DirectX(R)12 and Vulkan(R) applications all input resources should be transitioned to [`D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE`](https://docs.microsoft.com/en-us/windows/win32/api/d3d12/ne-d3d12-d3d12_resource_states) and [`VK_ACCESS_SHADER_READ_BIT`](https://www.khronos.org/registry/vulkan/specs/1.3-extensions/man/html/VkAccessFlagBits.html) respectively before calling [`ffxFsr2ContextDispatch`](src/ffx-fsr2-api/ffx_fsr2.h#L256). | Name | Resolution | Format | Type | Notes | | ----------------|------------------------------|------------------------------------|-----------|------------------------------------------------| -| Color buffer | Render | `APPLICATION SPECIFIED` | Texture | The render resolution color buffer for the current frame provided by the application. If the contents of the color buffer are in high dynamic range (HDR), then the [`FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE`](src/ffx-fsr2-api/ffx_fsr2.h#L87) flag should be set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure. | -| Depth buffer | Render | `APPLICATION SPECIFIED (1x FLOAT)` | Texture | The render resolution depth buffer for the current frame provided by the application. The data should be provided as a single floating point value, the precision of which is under the application's control. The configuration of the depth should be communicated to FSR2 via the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L164). You should set the [`FFX_FSR2_ENABLE_DEPTH_INVERTED`](src/ffx-fsr2-api/ffx_fsr2.h#L90) flag if your depth buffer is inverted (that is [1..0] range), and you should set the [`FFX_FSR2_ENABLE_DEPTH_INFINITE`](src/ffx-fsr2-api/ffx_fsr2.h#L91) flag if your depth buffer has an infinite far plane. If the application provides the depth buffer in `D32S8` format, then FSR2 will ignore the stencil component of the buffer, and create an `R32_FLOAT` resource to address the depth buffer. On GCN and RDNA hardware, depth buffers are stored separately from stencil buffers. | -| Motion vectors | Render or presentation | `APPLICATION SPECIFIED (2x FLOAT)` | Texture | The 2D motion vectors for the current frame provided by the application in [**(<-width, -height>**..****] range. 
If your application renders motion vectors with a different range, you may use the [`motionVectorScale`](src/ffx-fsr2-api/ffx_fsr2.h#L125) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L114) structure to adjust them to match the expected range for FSR2. Internally, FSR2 uses 16-bit quantities to represent motion vectors in many cases, which means that while motion vectors with greater precision can be provided, FSR2 will not benefit from the increased precision. The resolution of the motion vector buffer should be equal to the render resolution, unless the [`FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS`](src/ffx-fsr2-api/ffx_fsr2.h#L88) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L164), in which case it should be equal to the presentation resolution. | +| Color buffer | Render | `APPLICATION SPECIFIED` | Texture | The render resolution color buffer for the current frame provided by the application. If the contents of the color buffer are in high dynamic range (HDR), then the [`FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE`](src/ffx-fsr2-api/ffx_fsr2.h#L88) flag should be set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure. | +| Depth buffer | Render | `APPLICATION SPECIFIED (1x FLOAT)` | Texture | The render resolution depth buffer for the current frame provided by the application. The data should be provided as a single floating point value, the precision of which is under the application's control. The configuration of the depth should be communicated to FSR2 via the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L166). You should set the [`FFX_FSR2_ENABLE_DEPTH_INVERTED`](src/ffx-fsr2-api/ffx_fsr2.h#L91) flag if your depth buffer is inverted (that is [1..0] range), and you should set the [`FFX_FSR2_ENABLE_DEPTH_INFINITE`](src/ffx-fsr2-api/ffx_fsr2.h#L92) flag if your depth buffer has an infinite far plane. If the application provides the depth buffer in `D32S8` format, then FSR2 will ignore the stencil component of the buffer, and create an `R32_FLOAT` resource to address the depth buffer. On GCN and RDNA hardware, depth buffers are stored separately from stencil buffers. | +| Motion vectors | Render or presentation | `APPLICATION SPECIFIED (2x FLOAT)` | Texture | The 2D motion vectors for the current frame provided by the application in [**(<-width, -height>**..****] range. If your application renders motion vectors with a different range, you may use the [`motionVectorScale`](src/ffx-fsr2-api/ffx_fsr2.h#L126) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L115) structure to adjust them to match the expected range for FSR2. Internally, FSR2 uses 16-bit quantities to represent motion vectors in many cases, which means that while motion vectors with greater precision can be provided, FSR2 will not benefit from the increased precision. 
The resolution of the motion vector buffer should be equal to the render resolution, unless the [`FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS`](src/ffx-fsr2-api/ffx_fsr2.h#L89) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L166), in which case it should be equal to the presentation resolution. | | Reactive mask | Render | `R8_UNORM` | Texture | As some areas of a rendered image do not leave a footprint in the depth buffer or include motion vectors, FSR2 provides support for a reactive mask texture which can be used to indicate to FSR2 where such areas are. Good examples of these are particles, or alpha-blended objects which do not write depth or motion vectors. If this resource is not set, then FSR2's shading change detection logic will handle these cases as best it can, but for optimal results, this resource should be set. For more information on the reactive mask please refer to the [Reactive mask](#reactive-mask) section. | -| Exposure | 1x1 | `R32_FLOAT` | Texture | A 1x1 texture containing the exposure value computed for the current frame. This resource is optional, and may be omitted if the [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L92) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L164). | +| Exposure | 1x1 | `R32_FLOAT` | Texture | A 1x1 texture containing the exposure value computed for the current frame. This resource is optional, and may be omitted if the [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L93) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L166). | ## Depth buffer configurations It is strongly recommended that an inverted, infinite depth buffer is used with FSR2. However, alternative depth buffer configurations are supported. An application should inform the FSR2 API of its depth buffer configuration by setting the appropriate flags during the creation of the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L164). The table below contains the appropriate flags. | FSR2 flag | Note | |----------------------------------|--------------------------------------------------------------------------------------------| -| [`FFX_FSR2_ENABLE_DEPTH_INVERTED`](src/ffx-fsr2-api/ffx_fsr2.h#L90) | A bit indicating that the input depth buffer data provided is inverted [max..0]. | -| [`FFX_FSR2_ENABLE_DEPTH_INFINITE`](src/ffx-fsr2-api/ffx_fsr2.h#L91) | A bit indicating that the input depth buffer data provided is using an infinite far plane. | +| [`FFX_FSR2_ENABLE_DEPTH_INVERTED`](src/ffx-fsr2-api/ffx_fsr2.h#L91) | A bit indicating that the input depth buffer data provided is inverted [max..0]. | +| [`FFX_FSR2_ENABLE_DEPTH_INFINITE`](src/ffx-fsr2-api/ffx_fsr2.h#L92) | A bit indicating that the input depth buffer data provided is using an infinite far plane. 
| ## Providing motion vectors @@ -201,11 +204,11 @@ A key part of a temporal algorithm (be it antialiasing or upscaling) is the prov ![alt text](docs/media/super-resolution-temporal/motion-vectors.svg "A diagram showing a 2D motion vector.") -If your application computes motion vectors in another space - for example normalized device coordinate space - then you may use the [`motionVectorScale`](src/ffx-fsr2-api/ffx_fsr2.h#L125) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L114) structure to instruct FSR2 to adjust them to match the expected range for FSR2. The code examples below illustrate how motion vectors may be scaled to screen space. The example HLSL and C++ code below illustrates how NDC-space motion vectors can be scaled using the FSR2 host API. +If your application computes motion vectors in another space - for example normalized device coordinate space - then you may use the [`motionVectorScale`](src/ffx-fsr2-api/ffx_fsr2.h#L126) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L115) structure to instruct FSR2 to adjust them to match the expected range for FSR2. The code examples below illustrate how motion vectors may be scaled to screen space. The example HLSL and C++ code below illustrates how NDC-space motion vectors can be scaled using the FSR2 host API. ```HLSL // GPU: Example of application NDC motion vector computation -float2 motionVector = (currentPosition.xy / currentPosition.w) - (previousPosition.xy / previousPosition.w); +float2 motionVector = (previousPosition.xy / previousPosition.w) - (currentPosition.xy / currentPosition.w); // CPU: Matching FSR 2.0 motionVectorScale configuration dispatchParameters.motionVectorScale.x = (float)renderWidth; @@ -213,43 +216,45 @@ dispatchParameters.motionVectorScale.y = (float)renderHeight; ``` ### Precision & resolution -Internally, FSR2 uses 16bit quantities to represent motion vectors in many cases, which means that while motion vectors with greater precision can be provided, FSR2 will not currently benefit from the increased precision. The resolution of the motion vector buffer should be equal to the render resolution, unless the [`FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS`](src/ffx-fsr2-api/ffx_fsr2.h#L88) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L114) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L164), in which case it should be equal to the presentation resolution. +Internally, FSR2 uses 16bit quantities to represent motion vectors in many cases, which means that while motion vectors with greater precision can be provided, FSR2 will not currently benefit from the increased precision. The resolution of the motion vector buffer should be equal to the render resolution, unless the [`FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS`](src/ffx-fsr2-api/ffx_fsr2.h#L89) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L115) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L166), in which case it should be equal to the presentation resolution. ### Coverage FSR2 will perform better quality upscaling when more objects provide their motion vectors. It is therefore advised that all opaque, alpha-tested and alpha-blended objects should write their motion vectors for all covered pixels. 
If vertex shader effects are applied - such as scrolling UVs - these calculations should also be factored into the calculation of motion for the best results. For alpha-blended objects it is also strongly advised that the alpha value of each covered pixel is stored to the corresponding pixel in the [reactive mask](#reactive-mask). This will allow FSR2 to perform better handling of alpha-blended objects during upscaling. The reactive mask is especially important for alpha-blended objects where writing motion vectors might be prohibitive, such as particles. ## Reactive mask -In the context of FSR2, the term "reactivity" means how much influence the samples rendered for the current frame have over the production of the final upscaled image. Typically, samples rendered for the current frame contribute a relatively modest amount to the result computed by FSR2; however, there are exceptions. To produce the best results for fast moving, alpha-blended objects, FSR2 requires the [Reproject & accumulate](#reproject-accumulate) stage to become more reactive for such pixels. As there is no good way to determine from either color, depth or motion vectors which pixels have been rendered using alpha blending, FSR2 performs best when applications explicity mark such areas. +In the context of FSR2, the term "reactivity" means how much influence the samples rendered for the current frame have over the production of the final upscaled image. Typically, samples rendered for the current frame contribute a relatively modest amount to the result computed by FSR2; however, there are exceptions. To produce the best results for fast moving, alpha-blended objects, FSR2 requires the [Reproject & accumulate](#reproject-accumulate) stage to become more reactive for such pixels. As there is no good way to determine from either color, depth or motion vectors which pixels have been rendered using alpha blending, FSR2 performs best when applications explicitly mark such areas. Therefore, it is strongly encouraged that applications provide a reactive mask to FSR2. The reactive mask guides FSR2 on where it should reduce its reliance on historical information when compositing the current pixel, and instead allow the current frame's samples to contribute more to the final result. The reactive mask allows the application to provide a value from [0..1] where 0 indicates that the pixel is not at all reactive (and should use the default FSR2 composition strategy), and a value of 1 indicates the pixel should be fully reactive. While there are other applications for the reactive mask, the primary application for the reactive mask is producing better results of upscaling images which include alpha-blended objects. A good proxy for reactiveness is actually the alpha value used when compositing an alpha-blended object into the scene, therefore, applications should write `alpha` to the reactive mask. It should be noted that it is unlikely that a reactive value of close to 1 will ever produce good results. Therefore, we recommend clamping the maximum reactive value to around 0.9. -If a [Reactive mask](#reactive-mask) is not provided to FSR2 (by setting the [`reactive`](src/ffx-fsr2-api/ffx_fsr2.h#L121) field of [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L114) to `NULL`) then an internally generated 1x1 texture with a cleared reactive value will be used. 
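As an illustration of the guidance above, the following is a minimal sketch of an alpha-blended pixel shader which also writes its blend alpha into the reactive mask, clamped to the recommended maximum of around 0.9. It assumes the reactive mask is bound as an additional render target; `VertexOutput` and `ShadeTransparentSurface` are illustrative placeholders rather than part of the FSR2 API.

```HLSL
// Illustrative only: an alpha-blended pass which also exports reactivity.
struct PsOutput
{
    float4 color    : SV_TARGET0;   // regular alpha-blended color output
    float  reactive : SV_TARGET1;   // bound to the R8_UNORM reactive mask
};

PsOutput main(VertexOutput input)
{
    PsOutput output;
    output.color = ShadeTransparentSurface(input); // application shading (placeholder)

    // Use the blend alpha as a proxy for reactivity, clamped to ~0.9 as recommended.
    output.reactive = min(output.color.a, 0.9f);
    return output;
}
```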
+If a [Reactive mask](#reactive-mask) is not provided to FSR2 (by setting the [`reactive`](src/ffx-fsr2-api/ffx_fsr2.h#L122) field of [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L115) to `NULL`) then an internally generated 1x1 texture with a cleared reactive value will be used. ## Transparency & composition mask In addition to the [Reactive mask](#reactive-mask), FSR2 provides for the application to denote areas of other specialist rendering which should be accounted for during the upscaling process. Examples of such special rendering include areas of raytraced reflections or animated textures. While the [Reactive mask](#reactive-mask) adjusts the accumulation balance, the [Transparency & composition mask](#transparency-and-composition-mask) adjusts the pixel locks created by FSR2. A pixel with a value of 0 in the [Transparency & composition mask](#ttransparency-and-composition-mask) does not perform any additional modification to the lock for that pixel. Conversely, a value of 1 denotes that the lock for that pixel should be completely removed. -If a [Transparency & composition mask](#transparency-and-composition-mask) is not provided to FSR2 (by setting the [`transparencyAndComposition`](#src/ffx-fsr2-api/ffx_fsr2.h#L122) field of [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L114) to `NULL`) then an internally generated 1x1 texture with a cleared transparency and composition value will be used. +If a [Transparency & composition mask](#transparency-and-composition-mask) is not provided to FSR2 (by setting the [`transparencyAndComposition`](#src/ffx-fsr2-api/ffx_fsr2.h#L123) field of [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L115) to `NULL`) then an internally generated 1x1 texture with a cleared transparency and composition value will be used. ## Automatically generating reactivity To help applications generate the [Reactive mask](#reactive-mask) and the [Transparency & composition mask](#transparency-and-composition-mask), FSR2 provides an optional helper API. Under the hood, the API launches a compute shader which computes these values for each pixel using a luminance-based heuristic. -Applications wishing to do this can call the [`ffxFsr2ContextGenerateReactiveMask`](src/ffx-fsr2-api/ffx_fsr2.h#L265) function and should pass two versions of the color buffer, one containing opaque only geometry, and the other containing both opaque and alpha-blended objects. +Applications wishing to do this can call the [`ffxFsr2ContextGenerateReactiveMask`](src/ffx-fsr2-api/ffx_fsr2.h#L267) function and should pass two versions of the color buffer, one containing opaque only geometry, and the other containing both opaque and alpha-blended objects. + +In version 2.1, this helper changed slightly in order to give developers more options when items such as decals were used, which may have resulted in shimmer on certain surfaces. A "binaryValue" can now be set in the FfxFsr2GenerateReactiveDescription struct, to provide a specific value to be written into the reactive mask instead of 1.0f, which can be too high. ## Exposure FSR2 provides two values which control the exposure used when performing upscaling. They are as follows: 1. **Pre-exposure** a value by which we divide the input signal to get back to the original signal produced by the game before any packing into lower precision render targets. -2. **Expsoure** a value which is multiplied against the result of the pre-exposed color value. +2. 
**Exposure** a value which is multiplied against the result of the pre-exposed color value. The exposure value should match that which the application uses during any subsequent tonemapping passes performed by the application. This means FSR2 will operate consistently with what is likely to be visible in the final tonemapped image. > In various stages of the FSR2 algorithm described in this document, FSR2 will compute its own exposure value for internal use. It is worth noting that all outputs from FSR2 will have this internal tonemapping reversed before the final output is written. Meaning that FSR2 returns results in the same domain as the original input signal. -Poorly selected exposure values can have a drastic impact on the final quality of FSR2's upscaling. Therefore, it is recommended that [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L92) is used by the application, unless there is a particular reason not to. When [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L92) is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure, the exposure calculation shown in the HLSL code below is used to compute the exposure value, this matches the exposure response of ISO 100 film stock. +Poorly selected exposure values can have a drastic impact on the final quality of FSR2's upscaling. Therefore, it is recommended that [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L93) is used by the application, unless there is a particular reason not to. When [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L93) is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure, the exposure calculation shown in the HLSL code below is used to compute the exposure value, this matches the exposure response of ISO 100 film stock. ```HLSL float ComputeAutoExposureFromAverageLog(float averageLogLuminance) @@ -274,7 +279,7 @@ With any image upscaling approach is it important to understand how to place oth | Post processing A | Post processing B | |--------------------------------|----------------------| | Screenspace reflections | Film grain | -| Screenspace ambient occlusion | Chromatic abberation | +| Screenspace ambient occlusion | Chromatic aberration | | Denoisers (shadow, reflections)| Vignette | | Exposure (optional) | Tonemapping | | | Bloom | @@ -327,7 +332,7 @@ Out of the box, the FSR2 API will compile into multiple libraries following the ## Memory management If the FSR2 API is used with one of the supplied backends (e.g: DirectX(R)12 or Vulkan(R)) then all the resources required by FSR2 are created as committed resources directly using the graphics device provided by the host application. However, by overriding the create and destroy family of functions present in the backend interface it is possible for an application to more precisely control the memory management of FSR2. -To do this, you can either provide a full custom backend to FSR2 via the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure passed to [`ffxFsr2ContextCreate`](src/ffx-fsr2-api/ffx_fsr2.h#L213) function, or you can retrieve the backend for your desired API and override the resource creation and destruction functions to handle them yourself. 
To do this, simply overwrite the [`fpCreateResource`](src/ffx-fsr2-api/ffx_fsr2_interface.h#L399) and [`fpDestroyResource`](src/ffx-fsr2-api/ffx_fsr2_interface.h#L403) function pointers. +To do this, you can either provide a full custom backend to FSR2 via the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure passed to [`ffxFsr2ContextCreate`](src/ffx-fsr2-api/ffx_fsr2.h#L215) function, or you can retrieve the backend for your desired API and override the resource creation and destruction functions to handle them yourself. To do this, simply overwrite the [`fpCreateResource`](src/ffx-fsr2-api/ffx_fsr2_interface.h#L360) and [`fpDestroyResource`](src/ffx-fsr2-api/ffx_fsr2_interface.h#L364) function pointers. ``` CPP // Setup DX12 interface. @@ -372,7 +377,7 @@ Internally, these function implement a Halton[2,3] sequence [[Halton](#reference ![alt text](docs/media/super-resolution-temporal/jitter-space.svg "A diagram showing how to map sub-pixel jitter offsets to projection offsets.") -It is important to understand that the values returned from the [`ffxFsr2GetJitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L422) are in unit pixel space, and in order to composite this correctly into a projection matrix we must convert them into projection offsets. The diagram above shows a single pixel in unit pixel space, and in projection space. The code listing below shows how to correctly composite the sub-pixel jitter offset value into a projection matrix. +It is important to understand that the values returned from the [`ffxFsr2GetJitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L424) are in unit pixel space, and in order to composite this correctly into a projection matrix we must convert them into projection offsets. The diagram above shows a single pixel in unit pixel space, and in projection space. The code listing below shows how to correctly composite the sub-pixel jitter offset value into a projection matrix. ``` CPP const int32_t jitterPhaseCount = ffxFsr2GetJitterPhaseCount(renderWidth, displayWidth); @@ -390,7 +395,7 @@ const Matrix4 jitteredProjectionMatrix = jitterTranslationMatrix * projectionMat Jitter should be applied to *all* rendering. This includes opaque, alpha transparent, and raytraced objects. For rasterized objects, the sub-pixel jittering values calculated by the [`ffxFsr2GetJitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L422) function can be applied to the camera projection matrix which is ultimately used to perform transformations during vertex shading. For raytraced rendering, the sub-pixel jitter should be applied to the ray's origin - often the camera's position. -Whether you elect to use the recommended [`ffxFsr2GetJitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L422) function or your own sequence generator, you must set the [`jitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L124) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L114) structure to inform FSR2 of the jitter offset that has been applied in order to render each frame. Moreover, if not using the recommended [`ffxFsr2GetJitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L422) function, care should be taken that your jitter sequence never generates a null vector; that is value of 0 in both the X and Y dimensions. 
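The jitter values described above are ultimately communicated to FSR2 through the per-frame dispatch parameters. The listing below is an illustrative sketch - it is not taken from the sample - of how the [`jitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L125) and related fields of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L115) structure might be filled out each frame for the DirectX(R)12 backend, including the [`frameTimeDelta`](src/ffx-fsr2-api/ffx_fsr2.h#L130) value in milliseconds. The resource pointers, timing and camera variables are application-side placeholders.

``` CPP
// Illustrative per-frame dispatch sketch. pCommandList and the ID3D12Resource
// pointers (pColorBuffer, pDepthBuffer, pMotionVectors, pUpscaledColor), as well
// as deltaTimeSeconds, cameraCut and the camera values, are application placeholders.
FfxFsr2DispatchDescription dispatchDescription = {};
dispatchDescription.commandList   = ffxGetCommandListDX12(pCommandList);
dispatchDescription.color         = ffxGetResourceDX12(&fsr2Context, pColorBuffer);
dispatchDescription.depth         = ffxGetResourceDX12(&fsr2Context, pDepthBuffer);
dispatchDescription.motionVectors = ffxGetResourceDX12(&fsr2Context, pMotionVectors);
dispatchDescription.output        = ffxGetResourceDX12(&fsr2Context, pUpscaledColor, nullptr, FFX_RESOURCE_STATE_UNORDERED_ACCESS);

dispatchDescription.jitterOffset.x = jitterX;                       // from ffxFsr2GetJitterOffset, as above
dispatchDescription.jitterOffset.y = jitterY;
dispatchDescription.motionVectorScale.x = (float)renderWidth;       // see "Providing motion vectors"
dispatchDescription.motionVectorScale.y = (float)renderHeight;
dispatchDescription.renderSize.width  = renderWidth;
dispatchDescription.renderSize.height = renderHeight;

dispatchDescription.enableSharpening = true;
dispatchDescription.sharpness        = 0.8f;
dispatchDescription.frameTimeDelta   = deltaTimeSeconds * 1000.0f;  // FSR2 expects milliseconds
dispatchDescription.preExposure      = 1.0f;
dispatchDescription.reset            = cameraCut;                   // true on camera jump cuts

dispatchDescription.cameraNear             = cameraNear;
dispatchDescription.cameraFar              = cameraFar;
dispatchDescription.cameraFovAngleVertical = cameraFovAngleVertical;

FfxErrorCode errorCode = ffxFsr2ContextDispatch(&fsr2Context, &dispatchDescription);
FFX_ASSERT(errorCode == FFX_OK);
```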
+Whether you elect to use the recommended [`ffxFsr2GetJitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L424) function or your own sequence generator, you must set the [`jitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L125) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L115) structure to inform FSR2 of the jitter offset that has been applied in order to render each frame. Moreover, if not using the recommended [`ffxFsr2GetJitterOffset`](src/ffx-fsr2-api/ffx_fsr2.h#L424) function, care should be taken that your jitter sequence never generates a null vector; that is value of 0 in both the X and Y dimensions. The table below shows the jitter sequence length for each of the default quality modes. @@ -403,7 +408,7 @@ The table below shows the jitter sequence length for each of the default quality | Custom | [1..n]x (per dimension) | `ceil(8 * n^2)` | ## Camera jump cuts -Most applications with real-time rendering have a large degree of temporal consistency between any two consecutive frames. However, there are cases where a change to a camera's transformation might cause an abrupt change in what is rendered. In such cases, FSR2 is unlikely to be able to reuse any data it has accumulated from previous frames, and should clear this data such to exclude it from consideration in the compositing process. In order to indicate to FSR2 that a jump cut has occurred with the camera you should set the [`reset`](src/ffx-fsr2-api/ffx_fsr2.h#L131) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L114) structure to `true` for the first frame of the discontinuous camera transformation. +Most applications with real-time rendering have a large degree of temporal consistency between any two consecutive frames. However, there are cases where a change to a camera's transformation might cause an abrupt change in what is rendered. In such cases, FSR2 is unlikely to be able to reuse any data it has accumulated from previous frames, and should clear this data such to exclude it from consideration in the compositing process. In order to indicate to FSR2 that a jump cut has occurred with the camera you should set the [`reset`](src/ffx-fsr2-api/ffx_fsr2.h#L132) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L115) structure to `true` for the first frame of the discontinuous camera transformation. Rendering performance may be slightly less than typical frame-to-frame operation when using the reset flag, as FSR2 will clear some additional internal resources. @@ -425,8 +430,13 @@ The following table illustrates the mipmap biasing factor which results from eva | Performance | 2.0X (per dimension) | -2.0 | | Ultra performance | 3.0X (per dimension) | -2.58 | +## Frame Time Delta Input +The FSR2 API requires [`frameTimeDelta`](src/ffx-fsr2-api/ffx_fsr2.h#L130) be provided by the application through the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L115) structure. This value is in __milliseconds__: if running at 60fps, the value passed should be around __16.6f__. + +The value is used within the temporal component of the FSR 2 auto-exposure feature. This allows for tuning of the history accumulation for quality purposes. + ## HDR support -High dynamic range images are supported in FSR2. To enable this, you should set the [`FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE`](src/ffx-fsr2-api/ffx_fsr2.h#L87) bit in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure. 
Images should be provided to FSR2 in linear color space. +High dynamic range images are supported in FSR2. To enable this, you should set the [`FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE`](src/ffx-fsr2-api/ffx_fsr2.h#L88) bit in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure. Images should be provided to FSR2 in linear color space. > Support for additional color spaces might be provided in a future revision of FSR2. @@ -473,7 +483,7 @@ Each pass stage of the algorithm is laid out in the sections following this one, The compute luminance pyramid stage has two responsibilities: 1. To produce a lower resolution version of the input color's luminance. This is used by shading change detection in the accumulation pass. -2. To produce a 1x1 exposure texture which is optionally used by the exposure calculations of the [Adjust input color](#adjust-input-color) stage to apply tonemapping, and the [Reproject & Accumulate](#project-and-accumulate) stage for reversing local tonemapping ahead of producing an ouput from FSR2. +2. To produce a 1x1 exposure texture which is optionally used by the exposure calculations of the [Adjust input color](#adjust-input-color) stage to apply tonemapping, and the [Reproject & Accumulate](#project-and-accumulate) stage for reversing local tonemapping ahead of producing an output from FSR2. ### Resource inputs @@ -483,7 +493,7 @@ The following table contains all resources consumed by the [Compute luminance py | Name | Temporal layer | Resolution | Format | Type | Notes | | ----------------|-----------------|--------------|-------------------------|-----------|----------------------------------------------| -| Color buffer | Current frame | Render | `APPLICATION SPECIFIED` | Texture | The render resolution color buffer for the current frame provided by the application. If the contents of the color buffer are in high dynamic range (HDR), then the [`FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE`](src/ffx-fsr2-api/ffx_fsr2.h#L87) flag should be set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure. | +| Color buffer | Current frame | Render | `APPLICATION SPECIFIED` | Texture | The render resolution color buffer for the current frame provided by the application. If the contents of the color buffer are in high dynamic range (HDR), then the [`FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE`](src/ffx-fsr2-api/ffx_fsr2.h#L87) flag should be set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure. | ### Resource outputs The following table contains all resources produced or modified by the [Compute luminance pyramid](#compute-luminance-pyramid) stage. @@ -492,11 +502,11 @@ The following table contains all resources produced or modified by the [Compute | Name | Temporal layer | Resolution | Format | Type | Notes | | ----------------------------|-----------------|------------------|-------------------------|-----------|----------------------------------------------| -| Exposure | Current frame | 1x1 | `R32_FLOAT` | Texture | A 1x1 texture containing the exposure value computed for the current frame. 
This resource is optional, and may be omitted if the [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L92) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L164). | +| Exposure | Current frame | 1x1 | `R32_FLOAT` | Texture | A 1x1 texture containing the exposure value computed for the current frame. This resource is optional, and may be omitted if the [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L92) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L166). | | Current luminance | Current frame | `Render * 0.5` | `R16_FLOAT` | Texture | A texture at 50% of render resolution texture which contains the luminance of the current frame. | ### Description -The [Compute luminance pyramid](#compute-luminance-pyramid) stage is implemented using FidelityFX [Single Pass Downsampler](single-pass-downsampler.md), an optimized technique for producing mipmap chains using a single compute shader dispatch. Instead of the conventional (full) pyramidal approach, SPD provides a mechanism to produce a specific set of mipmap levels for an arbitrary input texture, as well as performing arbitrary calculations on that data as we store it to the target location in memory. In FSR2, we are interested in producing in upto two intermediate resources depending on the configuration of the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L164). The first resource is a low-resolution representation of the current luminance, this is used later in FSR2 to attempt to detect shading changes. The second is the exposure value, and while it is always computed, it is only used by subsequent stages if the [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L92) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure upon context creation. The exposure value - either from the application, or the [Compute luminance pyramid](#compute-luminance-pyramid) stage - is used in the [Adjust input color](#adjust-input-color) stage of FSR2, as well as by the [Reproject & Accumulate](#project-and-accumulate) stage. +The [Compute luminance pyramid](#compute-luminance-pyramid) stage is implemented using FidelityFX [Single Pass Downsampler](single-pass-downsampler.md), an optimized technique for producing mipmap chains using a single compute shader dispatch. Instead of the conventional (full) pyramidal approach, SPD provides a mechanism to produce a specific set of mipmap levels for an arbitrary input texture, as well as performing arbitrary calculations on that data as we store it to the target location in memory. In FSR2, we are interested in producing in upto two intermediate resources depending on the configuration of the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L166). The first resource is a low-resolution representation of the current luminance, this is used later in FSR2 to attempt to detect shading changes. 
The second is the exposure value, and while it is always computed, it is only used by subsequent stages if the [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L93) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure upon context creation. The exposure value - either from the application, or the [Compute luminance pyramid](#compute-luminance-pyramid) stage - is used in the [Adjust input color](#adjust-input-color) stage of FSR2, as well as by the [Reproject & Accumulate](#project-and-accumulate) stage. ![alt text](docs/media/super-resolution-temporal/auto-exposure.svg "A diagram showing the mipmap levels written by auto-exposure.") @@ -542,8 +552,8 @@ The following table contains all resources consumed by the [Adjust input color]( | Name | Temporal layer | Resolution | Format | Type | Notes | | ----------------|-----------------|--------------|---------------------------|-----------|----------------------------------------------| -| Color buffer | Current frame | Render | `APPLICATION SPECIFIED` | Texture | The render resolution color buffer for the current frame provided by the application. If the contents of the color buffer are in high dynamic range (HDR), then the [`FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE`](src/ffx-fsr2-api/ffx_fsr2.h#L87) flag should be set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure. | -| Exposure | Current frame | 1x1 | ``R32_FLOAT`` | Texture | A 1x1 texture containing the exposure value computed for the current frame. This resource can be supplied by the application, or computed by the [Compute luminance pyramid](#compute-luminance-pyramid) stage of FSR2 if the [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L92) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure. | +| Color buffer | Current frame | Render | `APPLICATION SPECIFIED` | Texture | The render resolution color buffer for the current frame provided by the application. If the contents of the color buffer are in high dynamic range (HDR), then the [`FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE`](src/ffx-fsr2-api/ffx_fsr2.h#L88) flag should be set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure. | +| Exposure | Current frame | 1x1 | ``R32_FLOAT`` | Texture | A 1x1 texture containing the exposure value computed for the current frame. This resource can be supplied by the application, or computed by the [Compute luminance pyramid](#compute-luminance-pyramid) stage of FSR2 if the [`FFX_FSR2_ENABLE_AUTO_EXPOSURE`](src/ffx-fsr2-api/ffx_fsr2.h#L93) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure. | ### Resource outputs The following table contains all resources produced or modified by the [Adjust input color](#Adjust-input-color) stage. @@ -567,7 +577,7 @@ As the luminance buffer is persistent (it is not available for aliasing, or clea | Green | n-2 | n - 1 | | Blue | n-3 | n - 2 | -The alpha channel of the luminance history buffer contains a measure of the stability of the luminance over the currrent frame, and the three frames that came before it. 
This is computed in the following way: +The alpha channel of the luminance history buffer contains a measure of the stability of the luminance over the current frame, and the three frames that came before it. This is computed in the following way: ``` HLSL float stabilityValue = 1.0f; @@ -592,8 +602,8 @@ The following table contains all of the resources which are required by the reco | Name | Temporal layer | Resolution | Format | Type | Notes | | ----------------------------|-----------------|------------|------------------------------------|-----------|------------------------------------------------| -| Depth buffer | Current frame | Render | `APPLICATION SPECIFIED (1x FLOAT)` | Texture | The render resolution depth buffer for the current frame provided by the application. The data should be provided as a single floating point value, the precision of which is under the application's control. The configuration of the depth should be communicated to FSR2 via the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L164). You should set the [`FFX_FSR2_ENABLE_DEPTH_INVERTED`](src/ffx-fsr2-api/ffx_fsr2.h#L90) flag if your depth buffer is inverted (that is [1..0] range), and you should set the flag if your depth buffer has as infinite far plane. If the application provides the depth buffer in `D32S8` format, then FSR2 will ignore the stencil component of the buffer, and create an `R32_FLOAT` resource to address the depth buffer. On GCN and RDNA hardware, depth buffers are stored separately from stencil buffers. | -| Motion vectors | Current fraame | Render or presentation | `APPLICATION SPECIFIED (2x FLOAT)` | Texture | The 2D motion vectors for the current frame provided by the application in [*(<-width, -height>*..**] range. If your application renders motion vectors with a different range, you may use the [`motionVectorScale`](src/ffx-fsr2-api/ffx_fsr2.h#L125) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L114) structure to adjust them to match the expected range for FSR2. Internally, FSR2 uses 16bit quantities to represent motion vectors in many cases, which means that while motion vectors with greater precision can be provided, FSR2 will not benefit from the increased precision. The resolution of the motion vector buffer should be equal to the render resolution, unless the [`FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS`](src/ffx-fsr2-api/ffx_fsr2.h#L88) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L103) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L101) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L164), in which case it should be equal to the presentation resolution. | +| Depth buffer | Current frame | Render | `APPLICATION SPECIFIED (1x FLOAT)` | Texture | The render resolution depth buffer for the current frame provided by the application. The data should be provided as a single floating point value, the precision of which is under the application's control. The configuration of the depth should be communicated to FSR2 via the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L166). 
You should set the [`FFX_FSR2_ENABLE_DEPTH_INVERTED`](src/ffx-fsr2-api/ffx_fsr2.h#L91) flag if your depth buffer is inverted (that is [1..0] range), and you should set the flag if your depth buffer has an infinite far plane. If the application provides the depth buffer in `D32S8` format, then FSR2 will ignore the stencil component of the buffer, and create an `R32_FLOAT` resource to address the depth buffer. On GCN and RDNA hardware, depth buffers are stored separately from stencil buffers. | +| Motion vectors | Current frame | Render or presentation | `APPLICATION SPECIFIED (2x FLOAT)` | Texture | The 2D motion vectors for the current frame provided by the application in [*(<-width, -height>*..**] range. If your application renders motion vectors with a different range, you may use the [`motionVectorScale`](src/ffx-fsr2-api/ffx_fsr2.h#L126) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L115) structure to adjust them to match the expected range for FSR2. Internally, FSR2 uses 16bit quantities to represent motion vectors in many cases, which means that while motion vectors with greater precision can be provided, FSR2 will not benefit from the increased precision. The resolution of the motion vector buffer should be equal to the render resolution, unless the [`FFX_FSR2_ENABLE_DISPLAY_RESOLUTION_MOTION_VECTORS`](src/ffx-fsr2-api/ffx_fsr2.h#L89) flag is set in the [`flags`](src/ffx-fsr2-api/ffx_fsr2.h#L104) field of the [`FfxFsr2ContextDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L102) structure when creating the [`FfxFsr2Context`](src/ffx-fsr2-api/ffx_fsr2.h#L166), in which case it should be equal to the presentation resolution. | ### Resource outputs The following table contains all of the resources which are produced by the reconstruct & dilate stage. @@ -609,7 +619,7 @@ The following table contains all of the resources which are produced by the reco ### Description The first step of the [Reconstruct & dilate](#reconstruct-and-dilate) stage is to compute the dilated depth values and motion vectors from the application's depth values and motion vectors for the current frame. Dilated depth values and motion vectors emphasise the edges of geometry which has been rendered into the depth buffer. This is because the edges of geometry will often introduce discontinuities into a contiguous series of depth values, meaning that as depth values and motion vectors are dilated, they will naturally follow the contours of the geometric edges present in the depth buffer. In order to compute the dilated depth values and motion vectors, FSR2 looks at the depth values for a 3x3 neighbourhood for each pixel and then selects the depth values and motion vectors in that neighbourhood where the depth value is nearest to the camera. In the diagram below, you can see how the central pixel of the 3x3 kernel is updated with the depth value and motion vectors from the pixel with the largest depth value - the pixel on the central, right hand side. -As this stage is the first time that motion vectors are consumed by FSR2, this is where motion vector scaling is applied if using the FSR2 host API. Motion vector scaling factors provided via the [`motionVectorScale`](src/ffx-fsr2-api/ffx_fsr2.h#L125) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L114) structure and allows you to transform non-screenspace motion vectors into screenspace motion vectors which FSR2 expects.
+As this stage is the first time that motion vectors are consumed by FSR2, this is where motion vector scaling is applied if using the FSR2 host API. Motion vector scaling factors are provided via the [`motionVectorScale`](src/ffx-fsr2-api/ffx_fsr2.h#L126) field of the [`FfxFsr2DispatchDescription`](src/ffx-fsr2-api/ffx_fsr2.h#L115) structure, and allow you to transform non-screenspace motion vectors into the screenspace motion vectors which FSR2 expects. ``` CPP // An example of how to manipulate motion vector scaling factors using the FSR2 host API. @@ -804,16 +814,21 @@ To build the FSR2 sample, please follow the following instructions: 3) Open the solutions in the DX12 or Vulkan directory (depending on your preference), compile and run. +# Limitations + +FSR 2 requires a GPU with typed UAV load support. + # Version history | Version | Date | Notes | | ---------------|-------------------|--------------------------------------------------------------| +| **2.1.0** | 2022-09-06 | Release of FidelityFX Super Resolution 2.1. | | **2.0.1** | 2022-06-22 | Initial release of FidelityFX Super Resolution 2.0. | # References [**Akeley-06**] Kurt Akeley and Jonathan Su, **"Minimum Triangle Separation for Correct Z-Buffer Occlusion"**, -[http://www.cs.cmu.edu/afs/cs/academic/class/15869-f11/www/readings/akeley06_triseparation.pdf](http://www.cs.cmu.edu/afs/cs/academic/class/15869-f11/www/readings/akeley06_triseparation.pdf) +[http://www.cs.cmu.edu/afs/cs/academic/class/15869-f11/www/readings/akeley06_triseparation.pdf](https://www.cs.cmu.edu/afs/cs/academic/class/15869-f11/www/readings/akeley06_triseparation.pdf) [**Lanczos**] Lanczos resampling, **"Lanczos resampling"**, [https://en.wikipedia.org/wiki/Lanczos_resampling](https://en.wikipedia.org/wiki/Lanczos_resampling) diff --git a/changelog.md b/changelog.md new file mode 100644 index 0000000..4bae377 --- /dev/null +++ b/changelog.md @@ -0,0 +1,19 @@ + +2022-09-06 | FidelityFX Super Resolution 2.1 +------- +- Reactivity mask now uses the full range of values in the mask (0.0 - 1.0). +- Reactivity and Composition and Transparency mask dilation is now based on input colors to avoid expanding reactiveness into non-relevant upscaled areas. +- Disocclusion logic improved in order to detect disocclusions in areas with very small depth separation. +- RCAS Pass forced to fp32 mode to reduce chance of issues seen with HDR input values. +- Fix for display-resolution motion vectors interpretation. +- FP16/FP32 computation review, readjusting balance of fp16/fp32 for maximum quality. +- Amended motion vector description within the documentation. +- Various documentation edits for spelling. +- Clarified the frame delta time input value within the readme documentation. +- Fixed issue with bad memset within the shader blob selection logic. + + +2022-06-22 | FidelityFX Super Resolution 2.0.1 +------- +- First release.
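The README hunk above introduces a `CPP` example of manipulating motion vector scaling factors through the FSR2 host API, but the body of that block falls outside the diff context. The following is a minimal sketch only, not the sample's actual snippet: it assumes the application writes motion vectors in normalized [0..1] texture space, and that `renderWidth`/`renderHeight` are application variables holding the render resolution.

``` CPP
// Sketch: convert normalized-UV motion vectors to the pixel-space range FSR2 expects
// by scaling them with the render resolution. renderWidth/renderHeight are assumed
// variables; the remaining dispatch fields are filled in as shown elsewhere in this patch.
FfxFsr2DispatchDescription dispatchParameters = {};
dispatchParameters.motionVectorScale.x = (float)renderWidth;
dispatchParameters.motionVectorScale.y = (float)renderHeight;
```

If motion vectors are already produced in pixel space at render resolution, a scale of 1.0 in both components leaves them unchanged.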
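The new `Limitations` section above states that FSR 2 requires a GPU with typed UAV load support. As a hedged illustration (not part of this patch), a DirectX 12 application could verify that capability with the standard feature query; `pDevice` is assumed to be a valid `ID3D12Device*`.

``` CPP
// Sketch: query typed UAV load support for the additional formats FSR2 relies on.
D3D12_FEATURE_DATA_D3D12_OPTIONS options = {};
if (SUCCEEDED(pDevice->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS, &options, sizeof(options))) &&
    options.TypedUAVLoadAdditionalFormats)
{
    // Typed UAV loads beyond the always-supported formats are available.
}
// Per-format support can be narrowed further with D3D12_FEATURE_FORMAT_SUPPORT if required.
```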
+ diff --git a/libs/cauldron b/libs/cauldron index 08e3881..b92d559 160000 --- a/libs/cauldron +++ b/libs/cauldron @@ -1 +1 @@ -Subproject commit 08e3881a04a0e207d65b4560d023c74c3775732e +Subproject commit b92d559bd083f44df9f8f42a6ad149c1584ae94c diff --git a/media/checkerboard.dds b/media/checkerboard.dds new file mode 100644 index 0000000..f1a6e7f Binary files /dev/null and b/media/checkerboard.dds differ diff --git a/media/composition_text.dds b/media/composition_text.dds new file mode 100644 index 0000000..4baf836 Binary files /dev/null and b/media/composition_text.dds differ diff --git a/media/lion.jpg b/media/lion.jpg new file mode 100644 index 0000000..e830649 Binary files /dev/null and b/media/lion.jpg differ diff --git a/release_notes.txt b/release_notes.txt index 3cc543b..deabc94 100644 --- a/release_notes.txt +++ b/release_notes.txt @@ -1,18 +1,29 @@ -FidelityFX Super Resolution 2.0.1 +FidelityFX Super Resolution 2.1 ================================= Features -------- -- The first release of FidelityFX Super Resolution 2.0. +- Reactivity mask interpretation has been modified to give game developers more levers to alleviate ghosting and other artefacts. +- Sample has example use of Reactivity mask. +- Sample has example use of Transparency and Composition mask. +- Sample has particles and animated textures. Changes ------- -- First release. +- Reactivity mask now uses the full range of values in the mask (0.0 - 1.0). +- Reactivity and Composition and Transparency mask dilation is now based on input colors to avoid expanding reactiveness into non-relevant upscaled areas. +- Disocclusion logic improved in order to detect disocclusions in areas with very small depth separation. +- RCAS Pass forced to fp32 mode to reduce chance of issues seen with HDR input values. +- Fix for display-resolution motion vectors interpretation. +- FP16/FP32 computation review, readjusting balance of fp16/fp32 for maximum quality. +- Amended motion vector description within the documentation. +- Various documentation edits for spelling. +- Clarified the frame delta time input value within the readme documentation. +- Fixed issue with bad memset within the shader blob selection logic. Limitations ----------- - The precise configuration and contents of the reactivity mask is subject to change in a future version of FSR2. -- Registered XBOX developers can refer to the GDK for an example of reactivity usage. Known issues ------------ diff --git a/src/DX12/AnimatedTexture.cpp b/src/DX12/AnimatedTexture.cpp new file mode 100644 index 0000000..b91b6e2 --- /dev/null +++ b/src/DX12/AnimatedTexture.cpp @@ -0,0 +1,183 @@ +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software.
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + + + +#include "AnimatedTexture.h" + + +void AnimatedTextures::OnCreate( Device& device, UploadHeap& uploadHeap, StaticBufferPool& bufferPool, ResourceViewHeaps& resourceViewHeaps, DynamicBufferRing& constantBufferRing ) +{ + m_pResourceViewHeaps = &resourceViewHeaps; + m_constantBufferRing = &constantBufferRing; + + D3D12_SHADER_BYTECODE vs = {}; + D3D12_SHADER_BYTECODE ps = {}; + CompileShaderFromFile( "AnimatedTexture.hlsl", nullptr, "VSMain", "-T vs_6_0", &vs ); + CompileShaderFromFile( "AnimatedTexture.hlsl", nullptr, "PSMain", "-T ps_6_0", &ps ); + + CD3DX12_DESCRIPTOR_RANGE DescRange[1] = {}; + DescRange[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0 ); // t0 + + CD3DX12_ROOT_PARAMETER rootParamters[2] = {}; + rootParamters[0].InitAsDescriptorTable( 1, &DescRange[0], D3D12_SHADER_VISIBILITY_PIXEL ); // textures + rootParamters[1].InitAsConstantBufferView( 0, 0, D3D12_SHADER_VISIBILITY_ALL ); + + CD3DX12_STATIC_SAMPLER_DESC sampler( 0 ); + CD3DX12_ROOT_SIGNATURE_DESC descRootSignature = CD3DX12_ROOT_SIGNATURE_DESC(); + descRootSignature.Init( _countof(rootParamters), rootParamters, 1, &sampler ); + + // deny uneccessary access to certain pipeline stages + descRootSignature.Flags = D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT + | D3D12_ROOT_SIGNATURE_FLAG_DENY_HULL_SHADER_ROOT_ACCESS + | D3D12_ROOT_SIGNATURE_FLAG_DENY_DOMAIN_SHADER_ROOT_ACCESS + | D3D12_ROOT_SIGNATURE_FLAG_DENY_GEOMETRY_SHADER_ROOT_ACCESS; + + ID3DBlob *pOutBlob, *pErrorBlob = NULL; + ThrowIfFailed(D3D12SerializeRootSignature(&descRootSignature, D3D_ROOT_SIGNATURE_VERSION_1, &pOutBlob, &pErrorBlob)); + ThrowIfFailed(device.GetDevice()->CreateRootSignature(0, pOutBlob->GetBufferPointer(), pOutBlob->GetBufferSize(), IID_PPV_ARGS(&m_pRootSignature))); + SetName( m_pRootSignature, "AnimatedTexture" ); + + pOutBlob->Release(); + if (pErrorBlob) + pErrorBlob->Release(); + + D3D12_GRAPHICS_PIPELINE_STATE_DESC descPso = {}; + descPso.pRootSignature = m_pRootSignature; + descPso.VS = vs; + descPso.PS = ps; + descPso.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC(D3D12_DEFAULT); + descPso.DepthStencilState.DepthFunc = D3D12_COMPARISON_FUNC_GREATER_EQUAL; + descPso.DSVFormat = DXGI_FORMAT_D32_FLOAT; + descPso.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT); + descPso.RasterizerState.CullMode = D3D12_CULL_MODE_NONE; + descPso.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT); + descPso.BlendState.IndependentBlendEnable = true; + descPso.BlendState.RenderTarget[0].RenderTargetWriteMask = D3D12_COLOR_WRITE_ENABLE_ALL; + descPso.BlendState.RenderTarget[1].RenderTargetWriteMask = D3D12_COLOR_WRITE_ENABLE_RED | D3D12_COLOR_WRITE_ENABLE_GREEN; + descPso.BlendState.RenderTarget[2].RenderTargetWriteMask = 0x0; + descPso.BlendState.RenderTarget[3].RenderTargetWriteMask = D3D12_COLOR_WRITE_ENABLE_RED; + descPso.SampleMask = UINT_MAX; + descPso.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + descPso.NumRenderTargets = 4; + descPso.RTVFormats[0] = DXGI_FORMAT_R16G16B16A16_FLOAT; + descPso.RTVFormats[1] = 
DXGI_FORMAT_R16G16_FLOAT; + descPso.RTVFormats[2] = DXGI_FORMAT_R8_UNORM; + descPso.RTVFormats[3] = DXGI_FORMAT_R8_UNORM; + descPso.SampleDesc.Count = 1; + + ThrowIfFailed(device.GetDevice()->CreateGraphicsPipelineState(&descPso, IID_PPV_ARGS(&m_pPipelines[0]))); + SetName(m_pPipelines[0], "AnimatedTexturePipelineComp"); + + descPso.BlendState.RenderTarget[3].RenderTargetWriteMask = 0; + ThrowIfFailed(device.GetDevice()->CreateGraphicsPipelineState(&descPso, IID_PPV_ARGS(&m_pPipelines[1]))); + SetName(m_pPipelines[1], "AnimatedTexturePipelineNoComp"); + + UINT indices[6] = { 0, 1, 2, 2, 1, 3 }; + bufferPool.AllocIndexBuffer( _countof( indices ), sizeof( UINT ), indices, &m_indexBuffer ); + + resourceViewHeaps.AllocCBV_SRV_UAVDescriptor( _countof( m_textures ), &m_descriptorTable ); + + m_textures[0].InitFromFile( &device, &uploadHeap, "..\\media\\lion.jpg", true ); + m_textures[1].InitFromFile( &device, &uploadHeap, "..\\media\\checkerboard.dds", true ); + m_textures[2].InitFromFile( &device, &uploadHeap, "..\\media\\composition_text.dds", true ); + + for ( int i = 0; i < _countof( m_textures ); i++ ) + { + m_textures[ i ].CreateSRV( i, &m_descriptorTable ); + } +} + + +void AnimatedTextures::OnDestroy() +{ + for ( int i = 0; i < _countof( m_textures ); i++ ) + { + m_textures[i].OnDestroy(); + } + + for ( int i = 0; i < _countof( m_pPipelines ); i++ ) + { + m_pPipelines[i]->Release(); + m_pPipelines[i] = nullptr; + } + + m_pRootSignature->Release(); + m_pRootSignature = nullptr; + m_pResourceViewHeaps = nullptr; +} + + +void AnimatedTextures::Render( ID3D12GraphicsCommandList* pCommandList, float frameTime, float speed, bool compositionMask, const Camera& camera ) +{ + struct ConstantBuffer + { + math::Matrix4 currentViewProj; + math::Matrix4 previousViewProj; + float jitterCompensation[ 2 ]; + float scrollFactor; + float rotationFactor; + int mode; + int pads[3]; + }; + + m_scrollFactor += frameTime * 1.0f * speed; + m_rotationFactor += frameTime * 2.0f * speed; + m_flipTimer += frameTime * 1.0f; + + if ( m_scrollFactor > 10.0f ) + m_scrollFactor -= 10.0f; + + const float twoPI = 6.283185307179586476925286766559f; + + if ( m_rotationFactor > twoPI ) + m_rotationFactor -= twoPI; + + int textureIndex = min( (int)floorf( m_flipTimer * 0.33333f ), _countof( m_textures ) - 1 ); + if ( m_flipTimer > 9.0f ) + m_flipTimer = 0.0f; + + D3D12_GPU_VIRTUAL_ADDRESS cb = {}; + ConstantBuffer* constantBuffer = nullptr; + m_constantBufferRing->AllocConstantBuffer( sizeof(*constantBuffer), (void**)&constantBuffer, &cb ); + + constantBuffer->currentViewProj = camera.GetProjection() * camera.GetView(); + constantBuffer->previousViewProj = camera.GetPrevProjection() * camera.GetPrevView(); + + constantBuffer->jitterCompensation[0] = camera.GetPrevProjection().getCol2().getX() - camera.GetProjection().getCol2().getX(); + constantBuffer->jitterCompensation[1] = camera.GetPrevProjection().getCol2().getY() - camera.GetProjection().getCol2().getY(); + constantBuffer->scrollFactor = m_scrollFactor; + constantBuffer->rotationFactor = m_rotationFactor; + constantBuffer->mode = textureIndex; + + ID3D12DescriptorHeap* descriptorHeaps[] = { m_pResourceViewHeaps->GetCBV_SRV_UAVHeap(), m_pResourceViewHeaps->GetSamplerHeap() }; + pCommandList->SetDescriptorHeaps( _countof( descriptorHeaps ), descriptorHeaps ); + pCommandList->SetGraphicsRootSignature( m_pRootSignature ); + pCommandList->SetGraphicsRootDescriptorTable( 0, m_descriptorTable.GetGPU( textureIndex ) ); + pCommandList->SetGraphicsRootConstantBufferView( 1, cb ); 
+ + pCommandList->IASetPrimitiveTopology( D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST ); + pCommandList->IASetIndexBuffer( &m_indexBuffer ); + pCommandList->IASetVertexBuffers( 0, 0, nullptr ); + pCommandList->SetPipelineState( m_pPipelines[compositionMask ? 0 : 1] ); + pCommandList->DrawIndexedInstanced( 6, 2, 0, 0, 0 ); +} + diff --git a/src/DX12/AnimatedTexture.h b/src/DX12/AnimatedTexture.h new file mode 100644 index 0000000..639b1c4 --- /dev/null +++ b/src/DX12/AnimatedTexture.h @@ -0,0 +1,56 @@ +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + + +#pragma once + + +#include "stdafx.h" + + +class AnimatedTextures +{ +public: + + AnimatedTextures() {} + virtual ~AnimatedTextures() {} + + void OnCreate( Device& device, UploadHeap& uploadHeap, StaticBufferPool& bufferPool, ResourceViewHeaps& resourceViewHeaps, DynamicBufferRing& constantBufferRing ); + void OnDestroy(); + + void Render( ID3D12GraphicsCommandList* pCommandList, float frameTime, float speed, bool compositionMask, const Camera& camera ); + +private: + + ResourceViewHeaps* m_pResourceViewHeaps = nullptr; + DynamicBufferRing* m_constantBufferRing = nullptr; + + ID3D12RootSignature* m_pRootSignature = nullptr; + ID3D12PipelineState* m_pPipelines[2] = {}; + D3D12_INDEX_BUFFER_VIEW m_indexBuffer = {}; + + Texture m_textures[3] = {}; + CBV_SRV_UAV m_descriptorTable = {}; + + float m_scrollFactor = 0.0f; + float m_rotationFactor = 0.0f; + float m_flipTimer = 0.0f; +}; \ No newline at end of file diff --git a/src/DX12/AnimatedTexture.hlsl b/src/DX12/AnimatedTexture.hlsl new file mode 100644 index 0000000..0f708e2 --- /dev/null +++ b/src/DX12/AnimatedTexture.hlsl @@ -0,0 +1,129 @@ +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + + +cbuffer cb : register(b0) +{ + matrix g_CurrentViewProjection; + matrix g_PreviousViewProjection; + float2 g_CameraJitterCompensation; + float g_ScrollFactor; + float g_RotationFactor; + int g_Mode; + int pad0; + int pad1; + int pad2; +} + + +Texture2D g_Texture : register(t0); +SamplerState g_Sampler : register(s0); + +struct VERTEX_OUT +{ + float4 CurrentPosition : TEXCOORD0; + float4 PreviousPosition : TEXCOORD1; + float3 TexCoord : TEXCOORD2; + float4 Position : SV_POSITION; +}; + + +VERTEX_OUT VSMain( uint vertexId : SV_VertexID, uint instanceId : SV_InstanceID ) +{ + VERTEX_OUT output = (VERTEX_OUT)0; + + const float2 offsets[ 4 ] = + { + float2( -1, 1 ), + float2( 1, 1 ), + float2( -1, -1 ), + float2( 1, -1 ), + }; + + float2 offset = offsets[ vertexId ]; + float2 uv = (offset+1)*float2( instanceId == 0 ? -0.5 : 0.5, -0.5 ); + + float4 worldPos = float4( offsets[ vertexId ], 0.0, 1.0 ); + + worldPos.xyz += instanceId == 0 ? float3( -13, 1.5, 2 ) : float3( -13, 1.5, -2 ); + + output.CurrentPosition = mul( g_CurrentViewProjection, worldPos ); + output.PreviousPosition = mul( g_PreviousViewProjection, worldPos ); + + output.Position = output.CurrentPosition; + + output.TexCoord.xy = uv; + output.TexCoord.z = instanceId; + + return output; +} + +struct Output +{ + float4 finalColor : SV_TARGET0; + float2 motionVectors : SV_TARGET1; + float upscaleReactive : SV_TARGET2; + float upscaleTransparencyAndComposition : SV_TARGET3; +}; + + +float4 TextureLookup( int billboardIndex, float2 uv0 ) +{ + float4 color = 1; + + if ( billboardIndex == 0 || g_Mode == 2 ) + { + // Scrolling + float2 uv = uv0; + if ( g_Mode == 2 ) + uv += float2( -g_ScrollFactor, 0.0 ); + else + uv += float2( -g_ScrollFactor, 0.5*g_ScrollFactor ); + + color.rgb = g_Texture.SampleLevel( g_Sampler, uv, 0 ).rgb; + } + else if ( billboardIndex == 1 ) + { + // Rotated UVs + float s, c; + sincos( g_RotationFactor, s, c ); + float2x2 rotation = { float2( c, s ), float2( -s, c ) }; + + float2 rotatedUV = mul( rotation, uv0-float2( 0.5, -0.5) ); + color.rgb = g_Texture.SampleLevel( g_Sampler, rotatedUV, 0 ).rgb; + } + + return color; +} + + +Output PSMain( VERTEX_OUT input ) +{ + Output output = (Output)0; + + output.finalColor = TextureLookup( (int)input.TexCoord.z, input.TexCoord.xy ); + + output.motionVectors = (input.PreviousPosition.xy / input.PreviousPosition.w) - (input.CurrentPosition.xy / input.CurrentPosition.w) + g_CameraJitterCompensation; + output.motionVectors *= float2(0.5f, -0.5f); + + output.upscaleReactive = 0; // Nothing to write to the reactice mask. Color writes are off on this target anyway. + output.upscaleTransparencyAndComposition = 1; // Write a value into here to indicate the depth and motion vectors are as expected for a static object, but the surface contents are changing. 
+ + return output; +} \ No newline at end of file diff --git a/src/DX12/CMakeLists.txt b/src/DX12/CMakeLists.txt index 1ad0f01..01330f1 100644 --- a/src/DX12/CMakeLists.txt +++ b/src/DX12/CMakeLists.txt @@ -38,6 +38,14 @@ set(sources stdafx.h UI.cpp UI.h + AnimatedTexture.cpp + AnimatedTexture.h + ../GpuParticles/ParticleHelpers.h + ../GpuParticles/ParticleSystem.h + ../GpuParticles/ParticleSystemInternal.h + ../GpuParticles/dx12/GPUParticleSystem.cpp + ../GpuParticles/dx12/ParallelSort.h + ../GpuParticles/dx12/ParallelSort.cpp dpiawarescaling.manifest) set(fsr1_shaders_src @@ -94,26 +102,40 @@ set(fsr2_shaders_src ${CMAKE_CURRENT_SOURCE_DIR}/../ffx-fsr2-api/shaders/ffx_fsr2_rcas.h ${CMAKE_CURRENT_SOURCE_DIR}/../ffx-fsr2-api/shaders/ffx_fsr2_autogen_reactive_pass.hlsl) -set(sample_shader_src - ${CMAKE_CURRENT_SOURCE_DIR}/UpscaleSpatial.hlsl - ${CMAKE_CURRENT_SOURCE_DIR}/FSRPass.hlsl +set(particle_shaders_src + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParticleStructs.h + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParticleHelpers.h + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/fp16util.h + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParallelSortCS.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParticleEmit.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParticleRender.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParticleSimulation.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ShaderConstants.h + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/SimulationBindings.h + ${CMAKE_CURRENT_SOURCE_DIR}/../ffx-parallelsort/FFX_ParallelSort.h) + +set(sample_shaders_src ${CMAKE_CURRENT_SOURCE_DIR}/GPUFrameRateLimiter.hlsl - ${CMAKE_CURRENT_SOURCE_DIR}/DebugBlit.hlsl) + ${CMAKE_CURRENT_SOURCE_DIR}/AnimatedTexture.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/DebugBlit.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/UpscaleSpatial.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/FSRPass.hlsl) set(APP_ICON_GPUOPEN "${CMAKE_CURRENT_SOURCE_DIR}/../common/GpuOpenIcon.rc") source_group("sources" FILES ${sources}) -source_group("shaders" FILES ${sample_shader_src}) -source_group("spd_shaders" FILES ${spd_shaders_src}) -source_group("fsr1_shaders" FILES ${fsr1_shaders_src}) +source_group("spatial_shaders" FILES ${fsr1_shaders_src}) source_group("fsr2_shaders" FILES ${fsr2_shaders_src}) +source_group("particle_shaders" FILES ${particle_shaders_src}) +source_group("sample_shaders" FILES ${sample_shaders_src}) copyCommand("${spd_shaders_src}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibDX) copyCommand("${fsr1_shaders_src}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibDX) copyCommand("${fsr2_shaders_src}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibDX) -copyCommand("${sample_shader_src}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibDX) +copyCommand("${particle_shaders_src}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibDX) +copyCommand("${sample_shaders_src}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibDX) -add_executable(FSR2_Sample_DX12 WIN32 ${sources} ${fsr1_shaders_src} ${spd_shaders_src} ${fsr2_shaders_src} ${sample_shader_src} ${common} ${APP_ICON_GPUOPEN}) +add_executable(FSR2_Sample_DX12 WIN32 ${sources} ${fsr2_src} ${sample_shaders_src} ${fsr1_shaders_src} ${fsr2_shaders_src} ${particle_shaders_src} ${spd_shaders_src} ${common} ${APP_ICON_GPUOPEN}) target_compile_definitions(FSR2_Sample_DX12 PRIVATE USE_PIX=1 $<$:FSR2_DEBUG_SHADERS=1>) target_link_libraries(FSR2_Sample_DX12 LINK_PUBLIC FSR2_Sample_Common Cauldron_DX12 ImGUI amd_ags d3dcompiler D3D12 ffx_fsr2_api_x64 ffx_fsr2_api_dx12_x64) 
target_include_directories(FSR2_Sample_DX12 PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../ffx-fsr2-api ${CMAKE_CURRENT_SOURCE_DIR}/../../libs) diff --git a/src/DX12/FSR2Sample.cpp b/src/DX12/FSR2Sample.cpp index 8556b5b..8712b78 100644 --- a/src/DX12/FSR2Sample.cpp +++ b/src/DX12/FSR2Sample.cpp @@ -51,7 +51,7 @@ void FSR2Sample::OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* // set some default values *pWidth = 1920; *pHeight = 1080; - m_activeScene = 0; //load the first one by default + m_UIState.m_activeScene = 0; //load the first one by default m_VsyncEnabled = false; m_bIsBenchmarking = false; m_fontSize = 13.f; // default value overridden by a json file if available @@ -66,7 +66,7 @@ void FSR2Sample::OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* *pWidth = jData.value("width", *pWidth); *pHeight = jData.value("height", *pHeight); m_fullscreenMode = jData.value("presentationMode", m_fullscreenMode); - m_activeScene = jData.value("activeScene", m_activeScene); + m_UIState.m_activeScene = jData.value("activeScene", m_UIState.m_activeScene); m_activeCamera = jData.value("activeCamera", m_activeCamera); m_isCpuValidationLayerEnabled = jData.value("CpuValidationLayerEnabled", m_isCpuValidationLayerEnabled); m_isGpuValidationLayerEnabled = jData.value("GpuValidationLayerEnabled", m_isGpuValidationLayerEnabled); @@ -774,7 +774,7 @@ int WINAPI WinMain(HINSTANCE hInstance, LPSTR lpCmdLine, int nCmdShow) { - LPCSTR Name = "FidelityFX Super Resolution 2.0"; + LPCSTR Name = "FidelityFX Super Resolution 2.1"; // create new DX sample return RunFramework(hInstance, lpCmdLine, nCmdShow, new FSR2Sample(Name)); diff --git a/src/DX12/FSR2Sample.h b/src/DX12/FSR2Sample.h index 188a3ad..f422309 100644 --- a/src/DX12/FSR2Sample.h +++ b/src/DX12/FSR2Sample.h @@ -77,7 +77,6 @@ private: // json config file json m_jsonConfigFile; std::vector m_sceneNames; - int m_activeScene; int m_activeCamera; bool m_bPlay; diff --git a/src/DX12/Renderer.cpp b/src/DX12/Renderer.cpp index d3d79b0..938302a 100644 --- a/src/DX12/Renderer.cpp +++ b/src/DX12/Renderer.cpp @@ -124,6 +124,9 @@ void Renderer::OnCreate(Device* pDevice, SwapChain *pSwapChain, float FontSize, // TAA m_ResourceViewHeaps.AllocCBV_SRV_UAVDescriptor(3, &m_UpscaleSRVs); + m_pGPUParticleSystem = IParticleSystem::CreateGPUSystem("..\\media\\atlas.dds"); + m_pGPUParticleSystem->OnCreateDevice(*pDevice, m_UploadHeap, m_ResourceViewHeaps, m_VidMemBufferPool, m_ConstantBufferRing); + m_GpuFrameRateLimiter.OnCreate(pDevice, &m_ResourceViewHeaps); // needs to be completely reinitialized, as the format potentially changes @@ -131,6 +134,8 @@ void Renderer::OnCreate(Device* pDevice, SwapChain *pSwapChain, float FontSize, DXGI_FORMAT mFormat = (hdr ? 
m_pGBufferHDRTexture->GetFormat() : DXGI_FORMAT_R8G8B8A8_UNORM); m_MagnifierPS.OnCreate(m_pDevice, &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_VidMemBufferPool, mFormat); + m_AnimatedTextures.OnCreate( *pDevice, m_UploadHeap, m_VidMemBufferPool, m_ResourceViewHeaps, m_ConstantBufferRing ); + ResetScene(); } @@ -141,8 +146,13 @@ void Renderer::OnCreate(Device* pDevice, SwapChain *pSwapChain, float FontSize, //-------------------------------------------------------------------------------------- void Renderer::OnDestroy() { + m_AnimatedTextures.OnDestroy(); m_GpuFrameRateLimiter.OnDestroy(); + m_pGPUParticleSystem->OnDestroyDevice(); + delete m_pGPUParticleSystem; + m_pGPUParticleSystem = nullptr; + m_AsyncPool.Flush(); m_ImGUI.OnDestroy(); @@ -244,6 +254,8 @@ void Renderer::OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, UISta m_MagnifierPS.OnCreateWindowSizeDependentResources(&m_displayOutput); + m_pGPUParticleSystem->OnResizedSwapChain(pState->renderWidth, pState->renderHeight, m_GBuffer.m_DepthBuffer); + // Lazy Upscale context generation: if ((m_pUpscaleContext == NULL) || (pState->m_nUpscaleType != m_pUpscaleContext->Type())) { @@ -276,6 +288,8 @@ void Renderer::OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, UISta //-------------------------------------------------------------------------------------- void Renderer::OnDestroyWindowSizeDependentResources() { + m_pGPUParticleSystem->OnReleasingSwapChain(); + m_displayOutput.OnDestroy(); m_renderOutput.OnDestroy(); m_OpaqueTexture.OnDestroy(); @@ -502,7 +516,7 @@ void Renderer::AllocateShadowMaps(GLTFCommon* pGLTFCommon) std::vector::iterator CurrentShadow = m_shadowMapPool.begin(); for( uint32_t i = 0; CurrentShadow < m_shadowMapPool.end(); ++i, ++CurrentShadow) { - CurrentShadow->ShadowMap.InitDepthStencil(m_pDevice, "m_pShadowMap", &CD3DX12_RESOURCE_DESC::Tex2D(DXGI_FORMAT_D32_FLOAT, CurrentShadow->ShadowResolution, CurrentShadow->ShadowResolution, 1, 1, 1, 0, D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL), 0.0f); + CurrentShadow->ShadowMap.InitDepthStencil(m_pDevice, "m_pShadowMap", &CD3DX12_RESOURCE_DESC::Tex2D(DXGI_FORMAT_D32_FLOAT, CurrentShadow->ShadowResolution, CurrentShadow->ShadowResolution, 1, 1, 1, 0, D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL), 1.0f); CurrentShadow->ShadowMap.CreateDSV(CurrentShadow->ShadowIndex, &m_ShadowMapPoolDSV); CurrentShadow->ShadowMap.CreateSRV(CurrentShadow->ShadowIndex, &m_ShadowMapPoolSRV); } @@ -545,6 +559,7 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai m_pUpscaleContext->PreDraw(pState); static float fLightModHelper = 2.f; + float fLightMod = 1.f; // Sets the perFrame data per_frame *pPerFrame = NULL; @@ -568,6 +583,27 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai m_pGLTFTexturesAndBuffers->SetSkinningMatricesForSkeletons(); } + { + m_state.flags = IParticleSystem::PF_Streaks | IParticleSystem::PF_DepthCull | IParticleSystem::PF_Sort; + m_state.flags |= pState->nReactiveMaskMode == REACTIVE_MASK_MODE_ON ? 
IParticleSystem::PF_Reactive : 0; + + const Camera& camera = pState->camera; + m_state.constantData.m_ViewProjection = camera.GetProjection() * camera.GetView(); + m_state.constantData.m_View = camera.GetView(); + m_state.constantData.m_ViewInv = math::inverse(camera.GetView()); + m_state.constantData.m_Projection = camera.GetProjection(); + m_state.constantData.m_ProjectionInv = math::inverse(camera.GetProjection()); + m_state.constantData.m_SunDirection = math::Vector4(0.7f, 0.7f, 0, 0); + m_state.constantData.m_SunColor = math::Vector4(0.8f, 0.8f, 0.7f, 0); + m_state.constantData.m_AmbientColor = math::Vector4(0.2f, 0.2f, 0.3f, 0); + + m_state.constantData.m_SunColor *= fLightMod; + m_state.constantData.m_AmbientColor *= fLightMod; + + m_state.constantData.m_FrameTime = pState->m_bPlayAnimations ? (0.001f * (float)pState->deltaTime) : 0.0f; + PopulateEmitters(pState->m_bPlayAnimations, pState->m_activeScene, 0.001f * (float)pState->deltaTime); + } + // command buffer calls ID3D12GraphicsCommandList* pCmdLst1 = m_CommandListRing.GetNewCommandList(); @@ -664,6 +700,12 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai #else pGltfPbr->DrawBatchList(pCmdLst1, &m_ShadowMapPoolSRV, &opaque, bWireframe); #endif + + if (pState->bRenderAnimatedTextures) + { + m_AnimatedTextures.Render(pCmdLst1, pState->m_bPlayAnimations ? (0.001f * (float)pState->deltaTime) : 0.0f, pState->m_fTextureAnimationSpeed, pState->bCompositionMask, Cam); + } + m_GPUTimer.GetTimeStamp(pCmdLst1, "PBR Opaque"); pRenderPassFullGBuffer->EndPass(); } @@ -711,6 +753,13 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai { pRenderPassFullGBuffer->BeginPass(pCmdLst1, false); + if (pState->bRenderParticleSystem) + { + pCmdLst1->ResourceBarrier(1, &CD3DX12_RESOURCE_BARRIER::Transition(m_GBuffer.m_DepthBuffer.GetResource(), D3D12_RESOURCE_STATE_DEPTH_WRITE, D3D12_RESOURCE_STATE_DEPTH_READ | D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE | D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE)); + m_pGPUParticleSystem->Render(pCmdLst1, m_ConstantBufferRing, m_state.flags, m_state.emitters, m_state.numEmitters, m_state.constantData); + pCmdLst1->ResourceBarrier(1, &CD3DX12_RESOURCE_BARRIER::Transition(m_GBuffer.m_DepthBuffer.GetResource(), D3D12_RESOURCE_STATE_DEPTH_READ | D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE | D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE, D3D12_RESOURCE_STATE_DEPTH_WRITE)); + } + std::sort(transparent.begin(), transparent.end()); pGltfPbr->DrawBatchList(pCmdLst1, &m_ShadowMapPoolSRV, &transparent, bWireframe); m_GPUTimer.GetTimeStamp(pCmdLst1, "PBR Transparent"); @@ -757,6 +806,24 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai }; pCmdLst1->ResourceBarrier(1, preResolve); + // if FSR2 and auto reactive mask is enabled: generate reactive mask + if (pState->nReactiveMaskMode == REACTIVE_MASK_MODE_AUTOGEN) + { + UpscaleContext::FfxUpscaleSetup upscaleSetup; + upscaleSetup.cameraSetup.vCameraPos = pState->camera.GetPosition(); + upscaleSetup.cameraSetup.mCameraView = pState->camera.GetView(); + upscaleSetup.cameraSetup.mCameraViewInv = math::inverse(pState->camera.GetView()); + upscaleSetup.cameraSetup.mCameraProj = pState->camera.GetProjection(); + upscaleSetup.opaqueOnlyColorResource = m_OpaqueTexture.GetResource(); + upscaleSetup.unresolvedColorResource = m_GBuffer.m_HDR.GetResource(); + upscaleSetup.motionvectorResource = m_GBuffer.m_MotionVectors.GetResource(); + upscaleSetup.depthbufferResource = 
m_GBuffer.m_DepthBuffer.GetResource(); + upscaleSetup.reactiveMapResource = m_GBuffer.m_UpscaleReactive.GetResource(); + upscaleSetup.transparencyAndCompositionResource = m_GBuffer.m_UpscaleTransparencyAndComposition.GetResource(); + upscaleSetup.resolvedColorResource = m_displayOutput.GetResource(); + m_pUpscaleContext->GenerateReactiveMask(pCmdLst1, upscaleSetup, pState); + } + // Post proc--------------------------------------------------------------------------- // Bloom, takes HDR as input and applies bloom to it. @@ -979,12 +1046,101 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai void Renderer::ResetScene() { + ZeroMemory(m_EmissionRates, sizeof(m_EmissionRates)); + // Reset the particle system when the scene changes so no particles from the previous scene persist + m_pGPUParticleSystem->Reset(); } -void Renderer::PopulateEmitters(float frameTime) +void Renderer::PopulateEmitters(bool playAnimations, int activeScene, float frameTime) { - bool m_Paused = false; + IParticleSystem::EmitterParams sparksEmitter = {}; + IParticleSystem::EmitterParams smokeEmitter = {}; + + sparksEmitter.m_NumToEmit = 0; + sparksEmitter.m_ParticleLifeSpan = 1.0f; + sparksEmitter.m_StartSize = 0.6f * 0.02f; + sparksEmitter.m_EndSize = 0.4f * 0.02f; + sparksEmitter.m_VelocityVariance = 1.5f; + sparksEmitter.m_Mass = 1.0f; + sparksEmitter.m_TextureIndex = 1; + sparksEmitter.m_Streaks = true; + + smokeEmitter.m_NumToEmit = 0; + smokeEmitter.m_ParticleLifeSpan = 50.0f; + smokeEmitter.m_StartSize = 0.4f; + smokeEmitter.m_EndSize = 1.0f; + smokeEmitter.m_VelocityVariance = 1.0f; + smokeEmitter.m_Mass = 0.0003f; + smokeEmitter.m_TextureIndex = 0; + smokeEmitter.m_Streaks = false; + + if ( activeScene == 0 ) // scene 0 = warehouse + { + m_state.numEmitters = 2; + m_state.emitters[0] = sparksEmitter; + m_state.emitters[1] = sparksEmitter; + + m_state.emitters[0].m_Position = math::Vector4(-4.15f, -1.85f, -3.8f, 1.0f); + m_state.emitters[0].m_PositionVariance = math::Vector4(0.1f, 0.0f, 0.0f, 1.0f); + m_state.emitters[0].m_Velocity = math::Vector4(0.0f, 0.08f, 0.8f, 1.0f); + m_EmissionRates[0].m_ParticlesPerSecond = 300.0f; + + m_state.emitters[1].m_Position = math::Vector4(-4.9f, -1.5f, -4.8f, 1.0f); + m_state.emitters[1].m_PositionVariance = math::Vector4(0.0f, 0.0f, 0.0f, 1.0f); + m_state.emitters[1].m_Velocity = math::Vector4(0.0f, 0.8f, -0.8f, 1.0f); + m_EmissionRates[1].m_ParticlesPerSecond = 400.0f; + + m_state.constantData.m_StartColor[0] = math::Vector4(10.0f, 10.0f, 2.0f, 0.9f); + m_state.constantData.m_EndColor[0] = math::Vector4(10.0f, 10.0f, 0.0f, 0.1f); + m_state.constantData.m_StartColor[1] = math::Vector4(10.0f, 10.0f, 2.0f, 0.9f); + m_state.constantData.m_EndColor[1] = math::Vector4(10.0f, 10.0f, 0.0f, 0.1f); + } + else if ( activeScene == 1 ) // Sponza + { + m_state.numEmitters = 2; + m_state.emitters[0] = smokeEmitter; + m_state.emitters[1] = sparksEmitter; + + m_state.emitters[0].m_Position = math::Vector4(-13.0f, 0.0f, 1.4f, 1.0f); + m_state.emitters[0].m_PositionVariance = math::Vector4(0.1f, 0.0f, 0.1f, 1.0f); + m_state.emitters[0].m_Velocity = math::Vector4(0.0f, 0.2f, 0.0f, 1.0f); + m_EmissionRates[0].m_ParticlesPerSecond = 10.0f; + + m_state.emitters[1].m_Position = math::Vector4(-13.0f, 0.0f, -1.4f, 1.0f); + m_state.emitters[1].m_PositionVariance = math::Vector4(0.05f, 0.0f, 0.05f, 1.0f); + m_state.emitters[1].m_Velocity = math::Vector4(0.0f, 4.0f, 0.0f, 1.0f); + m_state.emitters[1].m_VelocityVariance = 0.5f; + m_state.emitters[1].m_StartSize = 0.02f; + 
m_state.emitters[1].m_EndSize = 0.02f; + m_state.emitters[1].m_Mass = 1.0f; + m_EmissionRates[1].m_ParticlesPerSecond = 500.0f; + + m_state.constantData.m_StartColor[0] = math::Vector4(0.3f, 0.3f, 0.3f, 0.4f); + m_state.constantData.m_EndColor[0] = math::Vector4(0.4f, 0.4f, 0.4f, 0.1f); + m_state.constantData.m_StartColor[1] = math::Vector4(10.0f, 10.0f, 10.0f, 0.9f); + m_state.constantData.m_EndColor[1] = math::Vector4(5.0f, 8.0f, 5.0f, 0.1f); + } + + // Update all our active emitters so we know how many whole numbers of particles to emit from each emitter this frame + for (int i = 0; i < m_state.numEmitters; i++) + { + m_state.constantData.m_EmitterLightingCenter[i] = m_state.emitters[ i ].m_Position; + + if (m_EmissionRates[i].m_ParticlesPerSecond > 0.0f) + { + m_EmissionRates[i].m_Accumulation += m_EmissionRates[i].m_ParticlesPerSecond * (playAnimations ? frameTime : 0.0f); + + if (m_EmissionRates[i].m_Accumulation > 1.0f) + { + float integerPart = 0.0f; + float fraction = modf(m_EmissionRates[i].m_Accumulation, &integerPart); + + m_state.emitters[i].m_NumToEmit = (int)integerPart; + m_EmissionRates[i].m_Accumulation = fraction; + } + } + } } diff --git a/src/DX12/Renderer.h b/src/DX12/Renderer.h index eb95ca3..d48da04 100644 --- a/src/DX12/Renderer.h +++ b/src/DX12/Renderer.h @@ -28,6 +28,10 @@ #include "UpscaleContext.h" #include "GPUFrameRateLimiter.h" +#include "../GpuParticles/ParticleSystem.h" +#include "../GpuParticleShaders/ShaderConstants.h" +#include "AnimatedTexture.h" + struct UIState; // We are queuing (backBufferCount + 0.5) frames, so we need to triple buffer the resources that get modified each frame @@ -60,11 +64,28 @@ public: void OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChain); - void ResetScene(); - void PopulateEmitters(float frameTime); void BuildDevUI(UIState* pState); private: + + struct State + { + float frameTime = 0.0f; + int numEmitters = 0; + IParticleSystem::EmitterParams emitters[10] = {}; + int flags = 0; + IParticleSystem::ConstantData constantData = {}; + }; + + struct EmissionRate + { + float m_ParticlesPerSecond = 0.0f; // Number of particles to emit per second + float m_Accumulation = 0.0f; // Running total of how many particles to emit over elapsed time + }; + + void ResetScene(); + void PopulateEmitters(bool playAnimations, int activeScene, float frameTime); + Device *m_pDevice; uint32_t m_Width; @@ -99,6 +120,13 @@ private: ColorConversionPS m_ColorConversionPS; MagnifierPS m_MagnifierPS; + // GPU Particle System + State m_state = {}; + IParticleSystem* m_pGPUParticleSystem = nullptr; + EmissionRate m_EmissionRates[NUM_EMITTERS] = {}; + + AnimatedTextures m_AnimatedTextures = {}; + // TAA CBV_SRV_UAV m_UpscaleSRVs; UpscaleContext* m_pUpscaleContext; diff --git a/src/DX12/UI.cpp b/src/DX12/UI.cpp index be15f54..0e625c1 100644 --- a/src/DX12/UI.cpp +++ b/src/DX12/UI.cpp @@ -73,7 +73,7 @@ void FSR2Sample::BuildUI() // if we haven't initialized GLTFLoader yet, don't draw UI. 
if (m_pGltfLoader == nullptr) { - LoadScene(m_activeScene); + LoadScene(m_UIState.m_activeScene); return; } @@ -123,12 +123,13 @@ void FSR2Sample::BuildUI() ImGui::Checkbox("Camera Headbobbing", &m_UIState.m_bHeadBobbing); auto getterLambda = [](void* data, int idx, const char** out_str)->bool { *out_str = ((std::vector *)data)->at(idx).c_str(); return true; }; - if (ImGui::Combo("Model", &m_activeScene, getterLambda, &m_sceneNames, (int)m_sceneNames.size())) + if (ImGui::Combo("Model", &m_UIState.m_activeScene, getterLambda, &m_sceneNames, (int)m_sceneNames.size())) { + m_UIState.bRenderAnimatedTextures = (m_UIState.m_activeScene == 1); // Note: // probably queueing this as an event and handling it at the end/beginning // of frame is a better idea rather than in the middle of drawing UI. - LoadScene(m_activeScene); + LoadScene(m_UIState.m_activeScene); //bail out as we need to reload everything ImGui::End(); @@ -220,14 +221,15 @@ void FSR2Sample::BuildUI() } - if (m_UIState.m_nUpscaleType == UPSCALE_TYPE_FSR_2_0) + if (ImGui::Checkbox("Dynamic resolution", &m_UIState.bDynamicRes)) { - if (ImGui::Checkbox("Dynamic resolution", &m_UIState.bDynamicRes)) { - OnResize(); - } + OnResize(); } - else - m_UIState.bDynamicRes = false; + + const char* reactiveOptions[] = { "Disabled", "Manual Reactive Mask Generation", "Autogen FSR2 Helper Function" }; + ImGui::Combo("Reactive Mask mode", (int*)(&m_UIState.nReactiveMaskMode), reactiveOptions, _countof(reactiveOptions)); + + ImGui::Checkbox("Use Transparency and Composition Mask", &m_UIState.bCompositionMask); } else if (m_UIState.m_nUpscaleType <= UPSCALE_TYPE_FSR_1_0) { @@ -258,6 +260,11 @@ void FSR2Sample::BuildUI() m_UIState.mipBias = mipBias[UPSCALE_QUALITY_MODE_NONE]; } + if (m_UIState.m_nUpscaleType != UPSCALE_TYPE_FSR_2_0) + { + m_UIState.bDynamicRes = false; + } + ImGui::Checkbox("RCAS Sharpening", &m_UIState.bUseRcas); if (m_UIState.m_nUpscaleType == UPSCALE_TYPE_FSR_2_0) { @@ -349,7 +356,7 @@ void FSR2Sample::BuildUI() if (ImGui::CollapsingHeader("Presentation Mode", ImGuiTreeNodeFlags_DefaultOpen)) { - const char* fullscreenModes[] = { "Windowed", "BorderlessFullscreen", "ExclusiveFulscreen" }; + const char* fullscreenModes[] = { "Windowed", "BorderlessFullscreen", "ExclusiveFullscreen" }; if (ImGui::Combo("Fullscreen Mode", (int*)&m_fullscreenMode, fullscreenModes, _countof(fullscreenModes))) { if (m_previousFullscreenMode != m_fullscreenMode) @@ -661,4 +668,4 @@ bool UIState::DevOption(float* pFloatValue, const char* name, float fMin, float void UIState::Text(const char* text) { ImGui::Text(text); -} \ No newline at end of file +} diff --git a/src/DX12/UI.h b/src/DX12/UI.h index 72a0319..37cc582 100644 --- a/src/DX12/UI.h +++ b/src/DX12/UI.h @@ -55,11 +55,25 @@ typedef enum UpscaleQualityMode { UPSCALE_QUALITY_MODE_COUNT } UpscaleQualityMode; +typedef enum ReactiveMaskMode { + REACTIVE_MASK_MODE_OFF = 0, // Nothing written to the reactive mask + REACTIVE_MASK_MODE_ON = 1, // Particles written to the reactive mask + REACTIVE_MASK_MODE_AUTOGEN = 2, // The mask is auto generated using FSR2's helper function + + // add above this. 
+ REACTIVE_MASK_MODE_COUNT +} ReactiveMaskMode; + struct UIState { Camera camera; bool m_bHeadBobbing = false; + bool m_bPlayAnimations = true; + float m_fTextureAnimationSpeed = 1.0f; + int m_activeScene = 0; + bool m_bAnimateSpotlight = false; + // // WINDOW MANAGEMENT // @@ -72,7 +86,12 @@ struct UIState int SelectedTonemapperIndex; float Exposure; float ExposureHdr = 1.f; - bool bReset = false; + + bool bReset = false; + + int nLightModulationMode = 0; + bool bRenderParticleSystem = true; + bool bRenderAnimatedTextures = true; bool bUseMagnifier; bool bLockMagnifierPosition; bool bLockMagnifierPositionHistory; @@ -104,15 +123,19 @@ struct UIState unsigned int closestVelocitySamplePattern = 0; // 5 samples float Feedback = 15.f / 16.f; - // FSR2 auto reactive - bool bUseFsr2AutoReactive = false; + // FSR2 reactive mask + ReactiveMaskMode nReactiveMaskMode = REACTIVE_MASK_MODE_ON; float fFsr2AutoReactiveScale = 1.f; - float fFsr2AutoReactiveThreshold = 0.01f; + float fFsr2AutoReactiveThreshold = 0.2f; + float fFsr2AutoReactiveBinaryValue = 0.9f; bool bFsr2AutoReactiveTonemap = true; bool bFsr2AutoReactiveInverseTonemap = false; bool bFsr2AutoReactiveThreshold = true; bool bFsr2AutoReactiveUseMax = true; + // FSR2 composition mask + bool bCompositionMask = true; + // FSR2 debug out bool bUseDebugOut = false; int nDebugBlitSurface = 6; // FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR diff --git a/src/DX12/UpscaleContext_FSR2_API.cpp b/src/DX12/UpscaleContext_FSR2_API.cpp index bb62bb3..896a5b5 100644 --- a/src/DX12/UpscaleContext_FSR2_API.cpp +++ b/src/DX12/UpscaleContext_FSR2_API.cpp @@ -104,8 +104,11 @@ void UpscaleContext_FSR2_API::OnCreateWindowSizeDependentResources( initializationParameters.maxRenderSize.height = renderHeight; initializationParameters.displaySize.width = displayWidth; initializationParameters.displaySize.height = displayHeight; - initializationParameters.flags = FFX_FSR2_ENABLE_DEPTH_INVERTED - | FFX_FSR2_ENABLE_AUTO_EXPOSURE; + initializationParameters.flags = FFX_FSR2_ENABLE_AUTO_EXPOSURE; + + if (m_bInvertedDepth) { + initializationParameters.flags |= FFX_FSR2_ENABLE_DEPTH_INVERTED; + } if (hdr) { initializationParameters.flags |= FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE; @@ -130,8 +133,13 @@ void UpscaleContext_FSR2_API::OnCreateWindowSizeDependentResources( void UpscaleContext_FSR2_API::OnDestroyWindowSizeDependentResources() { UpscaleContext::OnDestroyWindowSizeDependentResources(); - ffxFsr2ContextDestroy(&context); - free(initializationParameters.callbacks.scratchBuffer); + // only destroy contexts which are live + if (initializationParameters.callbacks.scratchBuffer != nullptr) + { + ffxFsr2ContextDestroy(&context); + free(initializationParameters.callbacks.scratchBuffer); + initializationParameters.callbacks.scratchBuffer = nullptr; + } } void UpscaleContext_FSR2_API::BuildDevUI(UIState* pState) @@ -158,6 +166,7 @@ void UpscaleContext_FSR2_API::GenerateReactiveMask(ID3D12GraphicsCommandList* pC generateReactiveParameters.scale = pState->fFsr2AutoReactiveScale; generateReactiveParameters.cutoffThreshold = pState->fFsr2AutoReactiveThreshold; + generateReactiveParameters.binaryValue = pState->fFsr2AutoReactiveBinaryValue; generateReactiveParameters.flags = (pState->bFsr2AutoReactiveTonemap ? FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_TONEMAP :0) | (pState->bFsr2AutoReactiveInverseTonemap ? FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_INVERSETONEMAP : 0) | (pState->bFsr2AutoReactiveThreshold ? 
FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_THRESHOLD : 0) | @@ -174,8 +183,26 @@ void UpscaleContext_FSR2_API::Draw(ID3D12GraphicsCommandList* pCommandList, cons dispatchParameters.depth = ffxGetResourceDX12(&context, cameraSetup.depthbufferResource, L"FSR2_InputDepth"); dispatchParameters.motionVectors = ffxGetResourceDX12(&context, cameraSetup.motionvectorResource, L"FSR2_InputMotionVectors"); dispatchParameters.exposure = ffxGetResourceDX12(&context, nullptr, L"FSR2_InputExposure"); - dispatchParameters.reactive = ffxGetResourceDX12(&context, cameraSetup.reactiveMapResource, L"FSR2_InputReactiveMap"); - dispatchParameters.transparencyAndComposition = ffxGetResourceDX12(&context, cameraSetup.transparencyAndCompositionResource, L"FSR2_TransparencyAndCompositionMap"); + + if ((pState->nReactiveMaskMode == ReactiveMaskMode::REACTIVE_MASK_MODE_ON) + || (pState->nReactiveMaskMode == ReactiveMaskMode::REACTIVE_MASK_MODE_AUTOGEN)) + { + dispatchParameters.reactive = ffxGetResourceDX12(&context, cameraSetup.reactiveMapResource, L"FSR2_InputReactiveMap"); + } + else + { + dispatchParameters.reactive = ffxGetResourceDX12(&context, nullptr, L"FSR2_EmptyInputReactiveMap"); + } + + if (pState->bCompositionMask == true) + { + dispatchParameters.transparencyAndComposition = ffxGetResourceDX12(&context, cameraSetup.transparencyAndCompositionResource, L"FSR2_TransparencyAndCompositionMap"); + } + else + { + dispatchParameters.transparencyAndComposition = ffxGetResourceDX12(&context, nullptr, L"FSR2_EmptyTransparencyAndCompositionMap"); + } + dispatchParameters.output = ffxGetResourceDX12(&context, cameraSetup.resolvedColorResource, L"FSR2_OutputUpscaledColor", FFX_RESOURCE_STATE_UNORDERED_ACCESS); dispatchParameters.jitterOffset.x = m_JitterX; dispatchParameters.jitterOffset.y = m_JitterY; diff --git a/src/DX12/UpscaleContext_Spatial.cpp b/src/DX12/UpscaleContext_Spatial.cpp index 6198201..f517f04 100644 --- a/src/DX12/UpscaleContext_Spatial.cpp +++ b/src/DX12/UpscaleContext_Spatial.cpp @@ -72,7 +72,7 @@ void UpscaleContext_Spatial::OnCreate(const FfxUpscaleInitParams& initParams) CD3DX12_STATIC_SAMPLER_DESC sd[4] = {}; sd[0].Init(0, D3D12_FILTER_MIN_MAG_MIP_POINT, D3D12_TEXTURE_ADDRESS_MODE_CLAMP, D3D12_TEXTURE_ADDRESS_MODE_CLAMP); - sd[1].Init(1, D3D12_FILTER_MIN_MAG_MIP_POINT, D3D12_TEXTURE_ADDRESS_MODE_CLAMP, D3D12_TEXTURE_ADDRESS_MODE_CLAMP); + sd[1].Init(1, D3D12_FILTER_MIN_MAG_MIP_LINEAR, D3D12_TEXTURE_ADDRESS_MODE_CLAMP, D3D12_TEXTURE_ADDRESS_MODE_CLAMP); sd[2].Init(2, D3D12_FILTER_MIN_MAG_MIP_LINEAR, D3D12_TEXTURE_ADDRESS_MODE_CLAMP, D3D12_TEXTURE_ADDRESS_MODE_CLAMP); sd[3].Init(3, D3D12_FILTER_MIN_MAG_MIP_POINT, D3D12_TEXTURE_ADDRESS_MODE_CLAMP, D3D12_TEXTURE_ADDRESS_MODE_CLAMP); diff --git a/src/GpuParticleShaders/Globals.h b/src/GpuParticleShaders/Globals.h new file mode 100644 index 0000000..6c9fce6 --- /dev/null +++ b/src/GpuParticleShaders/Globals.h @@ -0,0 +1,92 @@ +// +// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#include "ShaderConstants.h" + + +#define FLOAT float +#define FLOAT2 float2 +#define FLOAT3 float3 +#define FLOAT4 float4 +#define FLOAT2X2 float2x2 +#define UINT uint +#define UINT2 uint2 +#define UINT3 uint3 +#define UINT4 uint4 + + +// Per-frame constant buffer +[[vk::binding( 10, 0 )]] cbuffer PerFrameConstantBuffer : register( b0 ) +{ + float4 g_StartColor[ NUM_EMITTERS ]; + float4 g_EndColor[ NUM_EMITTERS ]; + + float4 g_EmitterLightingCenter[ NUM_EMITTERS ]; + + matrix g_mViewProjection; + matrix g_mView; + matrix g_mViewInv; + matrix g_mProjection; + matrix g_mProjectionInv; + + float4 g_EyePosition; + float4 g_SunDirection; + float4 g_SunColor; + float4 g_AmbientColor; + + float4 g_SunDirectionVS; + + uint g_ScreenWidth; + uint g_ScreenHeight; + float g_InvScreenWidth; + float g_InvScreenHeight; + + float g_AlphaThreshold; + float g_ElapsedTime; + float g_CollisionThickness; + int g_CollideParticles; + + int g_ShowSleepingParticles; + int g_EnableSleepState; + float g_FrameTime; + int g_MaxParticles; + + uint g_NumTilesX; + uint g_NumTilesY; + uint g_NumCoarseCullingTilesX; + uint g_NumCoarseCullingTilesY; + + uint g_NumCullingTilesPerCoarseTileX; + uint g_NumCullingTilesPerCoarseTileY; + uint g_AlignedScreenWidth; + uint g_Pad1; +}; + + + + + +// Declare the global samplers +[[vk::binding( 12, 0 )]] SamplerState g_samWrapLinear : register( s0 ); +[[vk::binding( 13, 0 )]] SamplerState g_samClampLinear : register( s1 ); +[[vk::binding( 14, 0 )]] SamplerState g_samWrapPoint : register( s2 ); + diff --git a/src/GpuParticleShaders/ParallelSortCS.hlsl b/src/GpuParticleShaders/ParallelSortCS.hlsl new file mode 100644 index 0000000..9dfb928 --- /dev/null +++ b/src/GpuParticleShaders/ParallelSortCS.hlsl @@ -0,0 +1,123 @@ +// ParallelSortCS.hlsl +// +// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + + +//-------------------------------------------------------------------------------------- +// ParallelSort Shaders/Includes +//-------------------------------------------------------------------------------------- +#define FFX_HLSL +#include "FFX_ParallelSort.h" + +[[vk::binding(0, 0)]] ConstantBuffer CBuffer : register(b0); // Constant buffer +[[vk::binding(0, 1)]] cbuffer SetupIndirectCB : register(b1) // Setup Indirect Constant buffer +{ + uint MaxThreadGroups; +}; + +struct RootConstantData +{ + uint CShiftBit; +}; + +#ifdef API_VULKAN +[[vk::push_constant]] RootConstantData rootConstData : register(b2); // Store the shift bit directly in the root signature +#else +ConstantBuffer rootConstData : register(b2); // Store the shift bit directly in the root signature +#endif + +[[vk::binding(0, 2)]] RWStructuredBuffer SrcBuffer : register(u0, space0); // The unsorted keys or scan data +[[vk::binding(2, 2)]] RWStructuredBuffer SrcPayload : register(u0, space1); // The payload data + +[[vk::binding(0, 4)]] RWStructuredBuffer SumTable : register(u0, space2); // The sum table we will write sums to +[[vk::binding(1, 4)]] RWStructuredBuffer ReduceTable : register(u0, space3); // The reduced sum table we will write sums to + +[[vk::binding(1, 2)]] RWStructuredBuffer DstBuffer : register(u0, space4); // The sorted keys or prefixed data +[[vk::binding(3, 2)]] RWStructuredBuffer DstPayload : register(u0, space5); // the sorted payload data + +[[vk::binding(0, 3)]] RWStructuredBuffer ScanSrc : register(u0, space6); // Source for Scan Data +[[vk::binding(1, 3)]] RWStructuredBuffer ScanDst : register(u0, space7); // Destination for Scan Data +[[vk::binding(2, 3)]] RWStructuredBuffer ScanScratch : register(u0, space8); // Scratch data for Scan + +[[vk::binding( 0, 5 )]] StructuredBuffer g_ElementCount : register( t0 ); +[[vk::binding(1, 5)]] RWStructuredBuffer CBufferUAV : register(u0, space10); // UAV for constant buffer parameters for indirect execution +[[vk::binding(2, 5)]] RWStructuredBuffer CountScatterArgs : register(u0, space11); // Count and Scatter Args for indirect execution +[[vk::binding(3, 5)]] RWStructuredBuffer ReduceScanArgs : register(u0, space12); // Reduce and Scan Args for indirect execution + + + +// FPS Count +[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] +void FPS_Count(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) +{ + // Call the uint version of the count part of the algorithm + FFX_ParallelSort_Count_uint(localID, groupID, CBuffer, rootConstData.CShiftBit, SrcBuffer, SumTable); +} + +// FPS Reduce +[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] +void FPS_CountReduce(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) +{ + // Call the reduce part of the algorithm + FFX_ParallelSort_ReduceCount(localID, groupID, CBuffer, SumTable, ReduceTable); +} + +// FPS Scan +[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] +void FPS_Scan(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) +{ + uint BaseIndex = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * 
FFX_PARALLELSORT_THREADGROUP_SIZE * groupID; + FFX_ParallelSort_ScanPrefix(CBuffer.NumScanValues, localID, groupID, 0, BaseIndex, false, + CBuffer, ScanSrc, ScanDst, ScanScratch); +} + +// FPS ScanAdd +[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] +void FPS_ScanAdd(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) +{ + // When doing adds, we need to access data differently because reduce + // has a more specialized access pattern to match optimized count + // Access needs to be done similarly to reduce + // Figure out what bin data we are reducing + uint BinID = groupID / CBuffer.NumReduceThreadgroupPerBin; + uint BinOffset = BinID * CBuffer.NumThreadGroups; + + // Get the base index for this thread group + //uint BaseIndex = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE * (groupID / FFX_PARALLELSORT_SORT_BIN_COUNT); + uint BaseIndex = (groupID % CBuffer.NumReduceThreadgroupPerBin) * FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; + + FFX_ParallelSort_ScanPrefix(CBuffer.NumThreadGroups, localID, groupID, BinOffset, BaseIndex, true, + CBuffer, ScanSrc, ScanDst, ScanScratch); +} + +// FPS Scatter +[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] +void FPS_Scatter(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) +{ + FFX_ParallelSort_Scatter_uint(localID, groupID, CBuffer, rootConstData.CShiftBit, SrcBuffer, DstBuffer, SumTable +#ifdef kRS_ValueCopy + ,SrcPayload, DstPayload +#endif // kRS_ValueCopy + ); +} + +[numthreads(1, 1, 1)] +void FPS_SetupIndirectParameters(uint localID : SV_GroupThreadID) +{ + FFX_ParallelSort_SetupIndirectParams(g_ElementCount[ 0 ], MaxThreadGroups, CBufferUAV, CountScatterArgs, ReduceScanArgs); +} \ No newline at end of file diff --git a/src/GpuParticleShaders/ParticleEmit.hlsl b/src/GpuParticleShaders/ParticleEmit.hlsl new file mode 100644 index 0000000..1a0123c --- /dev/null +++ b/src/GpuParticleShaders/ParticleEmit.hlsl @@ -0,0 +1,101 @@ +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#include "ParticleStructs.h" +#include "SimulationBindings.h" + + +// Emitter index has 8 bits +// Texture index has 5 bits +uint WriteEmitterProperties( uint emitterIndex, uint textureIndex, bool isStreakEmitter ) +{ + uint properties = 0; + + properties |= (emitterIndex & 0xff) << 16; + + properties |= ( textureIndex & 0x1f ) << 24; + + if ( isStreakEmitter ) + { + properties |= 1 << 30; + } + + return properties; +} + + +groupshared int g_ldsNumParticlesAvailable; + + +// Emit particles, one per thread, in blocks of 1024 at a time +[numthreads(1024,1,1)] +void CS_Emit( uint3 localIdx : SV_GroupThreadID, uint3 globalIdx : SV_DispatchThreadID ) +{ + if ( localIdx.x == 0 ) + { + int maxParticles = min( g_MaxParticlesThisFrame, g_MaxParticles ); + g_ldsNumParticlesAvailable = clamp( g_DeadList[ 0 ], 0, maxParticles ); + } + + GroupMemoryBarrierWithGroupSync(); + + // Check to make sure we don't emit more particles than we specified + if ( globalIdx.x < g_ldsNumParticlesAvailable ) + { + int numDeadParticles = 0; + InterlockedAdd( g_DeadList[ 0 ], -1, numDeadParticles ); + + if ( numDeadParticles > 0 && numDeadParticles <= g_MaxParticles ) + { + // Initialize the particle data to zero to avoid any unexpected results + GPUParticlePartA pa = (GPUParticlePartA)0; + GPUParticlePartB pb = (GPUParticlePartB)0; + + // Generate some random numbers from reading the random texture + float2 uv = float2( globalIdx.x / 1024.0, g_ElapsedTime ); + float3 randomValues0 = g_RandomBuffer.SampleLevel( g_samWrapPoint, uv, 0 ).xyz; + + float2 uv2 = float2( (globalIdx.x + 1) / 1024.0, g_ElapsedTime ); + float3 randomValues1 = g_RandomBuffer.SampleLevel( g_samWrapPoint, uv2, 0 ).xyz; + + float velocityMagnitude = length( g_vEmitterVelocity.xyz ); + + pb.m_Position = g_vEmitterPosition.xyz + ( randomValues0.xyz * g_PositionVariance.xyz ); + + pa.m_StreakLengthAndEmitterProperties = WriteEmitterProperties( g_EmitterIndex, g_TextureIndex, g_EmitterStreaks ? true : false ); + pa.m_CollisionCount = 0; + + pb.m_Mass = g_Mass; + pb.m_Velocity = g_vEmitterVelocity.xyz + ( randomValues1.xyz * velocityMagnitude * g_VelocityVariance ); + pb.m_Lifespan = g_ParticleLifeSpan; + pb.m_Age = pb.m_Lifespan; + pb.m_StartSize = g_StartSize; + pb.m_EndSize = g_EndSize; + + int index = g_DeadList[ numDeadParticles ]; + + // Write the new particle state into the global particle buffer + g_ParticleBufferA[ index ] = pa; + g_ParticleBufferB[ index ] = pb; + } + } +} diff --git a/src/GpuParticleShaders/ParticleHelpers.h b/src/GpuParticleShaders/ParticleHelpers.h new file mode 100644 index 0000000..c1ed0a0 --- /dev/null +++ b/src/GpuParticleShaders/ParticleHelpers.h @@ -0,0 +1,36 @@ +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+
+
+
+float GetTextureOffset( uint emitterProperties )
+{
+ uint index = (emitterProperties & 0x1f000000) >> 24;
+
+ return (float)index * 0.5; // Assumes 2 textures in the atlas!
+}
+
+bool IsStreakEmitter( uint emitterProperties )
+{
+ return ( emitterProperties >> 30 ) & 0x01 ? true : false;
+}
+
diff --git a/src/GpuParticleShaders/ParticleRender.hlsl b/src/GpuParticleShaders/ParticleRender.hlsl
new file mode 100644
index 0000000..12ad49f
--- /dev/null
+++ b/src/GpuParticleShaders/ParticleRender.hlsl
@@ -0,0 +1,263 @@
+//
+// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+
+//
+// Shader code for rendering particles as simple quads using rasterization
+//
+
+#include "ParticleStructs.h"
+#include "ParticleHelpers.h"
+#include "fp16util.h"
+
+
+struct PS_INPUT
+{
+ nointerpolation float4 ViewSpaceCentreAndRadius : TEXCOORD0;
+ float2 TexCoord : TEXCOORD1;
+ float3 ViewPos : TEXCOORD2;
+ nointerpolation float3 VelocityXYEmitterNdotL : TEXCOORD3;
+ nointerpolation float3 Extrusion : TEXCOORD4;
+ nointerpolation float2 EllipsoidRadius : TEXCOORD5;
+ nointerpolation float4 Color : COLOR0;
+ float4 Position : SV_POSITION;
+};
+
+
+// The particle buffer data.
Note this is only one half of the particle data - the data that is relevant to rendering as opposed to simulation +[[vk::binding( 0, 0 )]] StructuredBuffer g_ParticleBufferA : register( t0 ); + +// A buffer containing the pre-computed view space positions of the particles +[[vk::binding( 1, 0 )]] StructuredBuffer g_PackedViewSpacePositions : register( t1 ); + +// The number of sorted particles +[[vk::binding( 2, 0 )]] StructuredBuffer g_NumParticlesBuffer : register( t2 ); + +// The sorted index list of particles +[[vk::binding( 3, 0 )]] StructuredBuffer g_SortedIndexBuffer : register( t3 ); + +// The texture atlas for the particles +[[vk::binding( 4, 0 )]] Texture2D g_ParticleTexture : register( t4 ); + +// The opaque scene depth buffer read as a texture +[[vk::binding( 5, 0 )]] Texture2D g_DepthTexture : register( t5 ); + +[[vk::binding( 6, 0 )]] cbuffer RenderingConstantBuffer : register( b0 ) +{ + matrix g_mProjection; + matrix g_mProjectionInv; + + float4 g_SunColor; + float4 g_AmbientColor; + float4 g_SunDirectionVS; + + uint g_ScreenWidth; + uint g_ScreenHeight; + uint g_pads0; + uint g_pads1; +}; + +[[vk::binding( 7, 0 )]] SamplerState g_samClampLinear : register( s0 ); + + +// Vertex shader only path +PS_INPUT VS_StructuredBuffer( uint VertexId : SV_VertexID ) +{ + PS_INPUT Output = (PS_INPUT)0; + + // Particle index + uint particleIndex = VertexId / 4; + + // Per-particle corner index + uint cornerIndex = VertexId % 4; + + float xOffset = 0; + + const float2 offsets[ 4 ] = + { + float2( -1, 1 ), + float2( 1, 1 ), + float2( -1, -1 ), + float2( 1, -1 ), + }; + + int NumParticles = g_NumParticlesBuffer[ 0 ]; + + int index = g_SortedIndexBuffer[ NumParticles - particleIndex - 1 ]; + + GPUParticlePartA pa = g_ParticleBufferA[ index ]; + + float4 ViewSpaceCentreAndRadius = UnpackFloat16( g_PackedViewSpacePositions[ index ] ); + float4 VelocityXYEmitterNdotLRotation = UnpackFloat16( pa.m_PackedVelocityXYEmitterNDotLAndRotation ); + + uint emitterProperties = pa.m_StreakLengthAndEmitterProperties; + + bool streaks = IsStreakEmitter( emitterProperties ); + + float2 offset = offsets[ cornerIndex ]; + float2 uv = (offset+1)*float2( 0.25, 0.5 ); + uv.x += GetTextureOffset( emitterProperties ); + + float radius = ViewSpaceCentreAndRadius.w; + float3 cameraFacingPos; + +#if defined (STREAKS) + if ( streaks ) + { + float2 viewSpaceVelocity = VelocityXYEmitterNdotLRotation.xy; + + float2 ellipsoidRadius = float2( radius, UnpackFloat16( pa.m_StreakLengthAndEmitterProperties ).x ); + + float2 extrusionVector = viewSpaceVelocity; + float2 tangentVector = float2( extrusionVector.y, -extrusionVector.x ); + float2x2 transform = float2x2( tangentVector, extrusionVector ); + + Output.Extrusion.xy = extrusionVector; + Output.Extrusion.z = 1.0; + Output.EllipsoidRadius = ellipsoidRadius; + + cameraFacingPos = ViewSpaceCentreAndRadius.xyz; + + cameraFacingPos.xy += mul( offset * ellipsoidRadius, transform ); + } + else +#endif + { + float s, c; + sincos( VelocityXYEmitterNdotLRotation.w, s, c ); + float2x2 rotation = { float2( c, -s ), float2( s, c ) }; + + offset = mul( offset, rotation ); + + cameraFacingPos = ViewSpaceCentreAndRadius.xyz; + cameraFacingPos.xy += radius * offset; + } + + Output.Position = mul( g_mProjection, float4( cameraFacingPos, 1 ) ); + + Output.TexCoord = uv; + Output.Color = UnpackFloat16( pa.m_PackedTintAndAlpha ); + Output.ViewSpaceCentreAndRadius = ViewSpaceCentreAndRadius; + Output.VelocityXYEmitterNdotL = VelocityXYEmitterNdotLRotation.xyz; + Output.ViewPos = cameraFacingPos; 
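+
+ // For reference: each particle is expanded into four vertices (VertexId / 4 selects the
+ // particle, VertexId % 4 the corner), so e.g. VertexId 6 is particle 1, corner 2, offset (-1, -1).
+ // The corner offset is applied in view space - rotated by the packed rotation angle and scaled
+ // by the radius for normal billboards, or transformed by the 2x2 tangent/extrusion basis built
+ // from the view-space velocity for streaks - and only then projected by g_mProjection.
+ // ViewPos and ViewSpaceCentreAndRadius are forwarded so the pixel shader can reconstruct a
+ // local UV and apply the soft depth fade against the scene depth buffer.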
+ + return Output; +} + + +struct PS_OUTPUT +{ + float4 color : SV_TARGET0; +#if defined (REACTIVE) + float reactiveMask : SV_TARGET2; +#endif +}; + + +// Ratserization path's pixel shader +PS_OUTPUT PS_Billboard( PS_INPUT In ) +{ + PS_OUTPUT output = (PS_OUTPUT)0; + + // Retrieve the particle data + float3 particleViewSpacePos = In.ViewSpaceCentreAndRadius.xyz; + float particleRadius = In.ViewSpaceCentreAndRadius.w; + + // Get the depth at this point in screen space + float depth = g_DepthTexture.Load( uint3( In.Position.x, In.Position.y, 0 ) ).x; + + // Get viewspace position by generating a point in screen space at the depth of the depth buffer + float4 viewSpacePos; + viewSpacePos.x = In.Position.x / (float)g_ScreenWidth; + viewSpacePos.y = 1 - ( In.Position.y / (float)g_ScreenHeight ); + viewSpacePos.xy = (2*viewSpacePos.xy) - 1; + viewSpacePos.z = depth; + viewSpacePos.w = 1; + + // ...then transform it into view space using the inverse projection matrix and a divide by W + viewSpacePos = mul( g_mProjectionInv, viewSpacePos ); + viewSpacePos.xyz /= viewSpacePos.w; + + // Calculate the depth fade factor + float depthFade = saturate( ( particleViewSpacePos.z - viewSpacePos.z ) / particleRadius ); + + float4 albedo = 1; + albedo.a = depthFade; + + // Read the texture atlas + albedo *= g_ParticleTexture.SampleLevel( g_samClampLinear, In.TexCoord, 0 ); // 2d + + // Multiply in the particle color + output.color = albedo * In.Color; + + // Calculate the UV based the screen space position + float3 n = 0; + float2 uv; +#if defined (STREAKS) + if ( In.Extrusion.z > 0.0 ) + { + float2 ellipsoidRadius = In.EllipsoidRadius; + + float2 extrusionVector = In.Extrusion.xy; + float2 tangentVector = float2( extrusionVector.y, -extrusionVector.x ); + float2x2 transform = float2x2( tangentVector, extrusionVector ); + + float2 vecToCentre = In.ViewPos.xy - particleViewSpacePos.xy; + vecToCentre = mul( transform, vecToCentre ); + + uv = vecToCentre / ellipsoidRadius; + } + else +#endif + { + uv = (In.ViewPos.xy - particleViewSpacePos.xy ) / particleRadius; + } + + // Scale and bias + uv = (1+uv)*0.5; + + float pi = 3.1415926535897932384626433832795; + + n.x = -cos( pi * uv.x ); + n.y = -cos( pi * uv.y ); + n.z = sin( pi * length( uv ) ); + n = normalize( n ); + + float ndotl = saturate( dot( g_SunDirectionVS.xyz, n ) ); + + // Fetch the emitter's lighting term + float emitterNdotL = In.VelocityXYEmitterNdotL.z; + + // Mix the particle lighting term with the emitter lighting + ndotl = lerp( ndotl, emitterNdotL, 0.5 ); + + // Ambient lighting plus directional lighting + float3 lighting = g_AmbientColor.rgb + ndotl * g_SunColor.rgb; + + // Multiply lighting term in + output.color.rgb *= lighting; + +#if defined (REACTIVE) + output.reactiveMask = max( output.color.r, max( output.color.g, output.color.b ) ) * albedo.a; +#endif + + return output; +} diff --git a/src/GpuParticleShaders/ParticleSimulation.hlsl b/src/GpuParticleShaders/ParticleSimulation.hlsl new file mode 100644 index 0000000..49cb4eb --- /dev/null +++ b/src/GpuParticleShaders/ParticleSimulation.hlsl @@ -0,0 +1,313 @@ +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +#include "ParticleStructs.h" +#include "fp16util.h" +#include "SimulationBindings.h" +#include "ParticleHelpers.h" + + + +uint GetEmitterIndex( uint emitterProperties ) +{ + return (emitterProperties >> 16) & 0xff; +} + + +bool IsSleeping( uint emitterProperties ) +{ + return ( emitterProperties >> 31 ) & 0x01 ? true : false; +} + + +uint SetIsSleepingBit( uint properties ) +{ + return properties | (1 << 31); +} + + +// Function to calculate the streak radius in X and Y given the particles radius and velocity +float2 calcEllipsoidRadius( float radius, float viewSpaceVelocitySpeed ) +{ + float radiusY = radius * max( 1.0, 0.1*viewSpaceVelocitySpeed ); + return float2( radius, radiusY ); +} + + +// Calculate the view space position given a point in screen space and a texel offset +float3 calcViewSpacePositionFromDepth( float2 normalizedScreenPosition, int2 texelOffset ) +{ + float2 uv; + + // Add the texel offset to the normalized screen position + normalizedScreenPosition.x += (float)texelOffset.x / (float)g_ScreenWidth; + normalizedScreenPosition.y += (float)texelOffset.y / (float)g_ScreenHeight; + + // Scale, bias and convert to texel range + uv.x = (0.5 + normalizedScreenPosition.x * 0.5) * (float)g_ScreenWidth; + uv.y = (1-(0.5 + normalizedScreenPosition.y * 0.5)) * (float)g_ScreenHeight; + + // Fetch the depth value at this point + float depth = g_DepthBuffer.Load( uint3( uv.x, uv.y, 0 ) ).x; + + // Generate a point in screen space with this depth + float4 viewSpacePosOfDepthBuffer; + viewSpacePosOfDepthBuffer.xy = normalizedScreenPosition.xy; + viewSpacePosOfDepthBuffer.z = depth; + viewSpacePosOfDepthBuffer.w = 1; + + // Transform into view space using the inverse projection matrix + viewSpacePosOfDepthBuffer = mul( g_mProjectionInv, viewSpacePosOfDepthBuffer ); + viewSpacePosOfDepthBuffer.xyz /= viewSpacePosOfDepthBuffer.w; + + return viewSpacePosOfDepthBuffer.xyz; +} + + +// Simulate 256 particles per thread group, one thread per particle +[numthreads(256,1,1)] +void CS_Simulate( uint3 id : SV_DispatchThreadID ) +{ + // Initialize the draw args and index buffer using the first thread in the Dispatch call + if ( id.x == 0 ) + { + g_DrawArgs[ 0 ].IndexCountPerInstance = 0; // Number of primitives reset to zero + g_DrawArgs[ 0 ].InstanceCount = 1; + g_DrawArgs[ 0 ].StartIndexLocation = 0; + g_DrawArgs[ 0 ].BaseVertexLocation = 0; + g_DrawArgs[ 0 ].StartInstanceLocation = 0; + + g_AliveParticleCount[ 0 ] = 0; 
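+
+ // For reference: these indirect draw arguments feed the ExecuteIndirect that rasterizes the
+ // billboards. InstanceCount stays at 1, and every particle that survives simulation below
+ // appends itself to the alive list and adds 6 indices (one quad, two triangles) to
+ // IndexCountPerInstance with an InterlockedAdd, e.g.:
+ // InterlockedAdd( g_DrawArgs[ 0 ].IndexCountPerInstance, 6, dstIdx );
+ // so exactly the surviving particles are drawn without a CPU readback of the count.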
+ } + + // Wait after draw args are written so no other threads can write to them before they are initialized + GroupMemoryBarrierWithGroupSync(); + + const float3 vGravity = float3( 0.0, -9.81, 0.0 ); + + // Fetch the particle from the global buffer + GPUParticlePartA pa = g_ParticleBufferA[ id.x ]; + GPUParticlePartB pb = g_ParticleBufferB[ id.x ]; + + // If the partile is alive + if ( pb.m_Age > 0.0f ) + { + // Extract the individual emitter properties from the particle + uint emitterProperties = pa.m_StreakLengthAndEmitterProperties; + uint emitterIndex = GetEmitterIndex( emitterProperties ); + bool streaks = IsStreakEmitter( emitterProperties ); + float4 velocityXYEmitterNDotLAndRotation;// = UnpackFloat16( pa.m_PackedVelocityXYEmitterNDotLAndRotation ); + + // Age the particle by counting down from Lifespan to zero + pb.m_Age -= g_FrameTime; + + // Update the rotation + pa.m_Rotation += 0.24 * g_FrameTime; + + float3 vNewPosition = pb.m_Position; + + // Apply force due to gravity + if ( !IsSleeping( emitterProperties ) ) + { + pb.m_Velocity += pb.m_Mass * vGravity * g_FrameTime; + + // Apply a little bit of a wind force + float3 windDir = float3( 1, 1, 0 ); + float windStrength = 0.1; + + pb.m_Velocity += normalize( windDir ) * windStrength * g_FrameTime; + + // Calculate the new position of the particle + vNewPosition += pb.m_Velocity * g_FrameTime; + } + + // Calculate the normalized age + float fScaledLife = 1.0 - saturate( pb.m_Age / pb.m_Lifespan ); + + // Calculate the size of the particle based on age + float radius = lerp( pb.m_StartSize, pb.m_EndSize, fScaledLife ); + + // By default, we are not going to kill the particle + bool killParticle = false; + + if ( g_CollideParticles && g_FrameTime > 0.0 ) + { + // Transform new position into view space + float3 viewSpaceParticlePosition = mul( g_mView, float4( vNewPosition, 1 ) ).xyz; + + // Also obtain screen space position + float4 screenSpaceParticlePosition = mul( g_mViewProjection, float4( vNewPosition, 1 ) ); + screenSpaceParticlePosition.xyz /= screenSpaceParticlePosition.w; + + // Only do depth buffer collisions if the particle is onscreen, otherwise assume no collisions + if ( !IsSleeping( emitterProperties ) && screenSpaceParticlePosition.x > -1 && screenSpaceParticlePosition.x < 1 && screenSpaceParticlePosition.y > -1 && screenSpaceParticlePosition.y < 1 ) + { + // Get the view space position of the depth buffer + float3 viewSpacePosOfDepthBuffer = calcViewSpacePositionFromDepth( screenSpaceParticlePosition.xy, int2( 0, 0 ) ); + + // If the particle view space position is behind the depth buffer, but not by more than the collision thickness, then a collision has occurred + if ( ( viewSpaceParticlePosition.z < viewSpacePosOfDepthBuffer.z ) && ( viewSpaceParticlePosition.z > viewSpacePosOfDepthBuffer.z - g_CollisionThickness ) ) + { + // Generate the surface normal. 
Ideally, we would use the normals from the G-buffer as this would be more reliable than deriving them
+ float3 surfaceNormal;
+
+ // Take three points on the depth buffer
+ float3 p0 = viewSpacePosOfDepthBuffer;
+ float3 p1 = calcViewSpacePositionFromDepth( screenSpaceParticlePosition.xy, int2( 1, 0 ) );
+ float3 p2 = calcViewSpacePositionFromDepth( screenSpaceParticlePosition.xy, int2( 0, 1 ) );
+
+ // Generate the view space normal from the two vectors
+ float3 viewSpaceNormal = normalize( cross( p2 - p0, p1 - p0 ) );
+
+ // Transform into world space using the inverse view matrix
+ surfaceNormal = normalize( mul( g_mViewInv, -viewSpaceNormal ).xyz );
+
+ // The velocity is reflected in the collision plane
+ float3 newVelocity = reflect( pb.m_Velocity, surfaceNormal );
+
+ // Update the velocity and apply some restitution
+ pb.m_Velocity = 0.3*newVelocity;
+
+ // Update the new collided position
+ vNewPosition = pb.m_Position + (pb.m_Velocity * g_FrameTime);
+
+ pa.m_CollisionCount++;
+ }
+ }
+ }
+
+ // Put particle to sleep if the velocity is small
+ if ( g_EnableSleepState && pa.m_CollisionCount > 10 && length( pb.m_Velocity ) < 0.01 )
+ {
+ pa.m_StreakLengthAndEmitterProperties = SetIsSleepingBit( emitterProperties );
+ }
+
+ // If the position is below the floor, let's kill it now rather than wait for it to retire
+ if ( vNewPosition.y < -10 )
+ {
+ killParticle = true;
+ }
+
+ // Write the new position
+ pb.m_Position = vNewPosition;
+
+ // Calculate the distance to the eye for sorting in the rasterization path
+ float3 vec = vNewPosition - g_EyePosition.xyz;
+ pb.m_DistanceToEye = length( vec );
+
+ // Lerp the color based on the age
+ float4 color0 = g_StartColor[ emitterIndex ];
+ float4 color1 = g_EndColor[ emitterIndex ];
+
+ float4 tintAndAlpha = 0;
+
+ tintAndAlpha = lerp( color0, color1, saturate(4*fScaledLife) );
+ tintAndAlpha.a = pb.m_Age <= 0 ?
0 : tintAndAlpha.a; + + if ( g_ShowSleepingParticles && IsSleeping( emitterProperties ) ) + { + tintAndAlpha.rgb = float3( 1, 0, 1 ); + } + + pa.m_PackedTintAndAlpha = PackFloat16( (min16float4)tintAndAlpha ); + + // The emitter-based lighting models the emitter as a vertical cylinder + float2 emitterNormal = normalize( vNewPosition.xz - g_EmitterLightingCenter[ emitterIndex ].xz ); + + // Generate the lighting term for the emitter + float emitterNdotL = saturate( dot( g_SunDirection.xz, emitterNormal ) + 0.5 ); + + // Transform the velocity into view space + float2 vsVelocity = mul( g_mView, float4( pb.m_Velocity.xyz, 0 ) ).xy; + float viewSpaceSpeed = 10 * length( vsVelocity ); + float streakLength = calcEllipsoidRadius( radius, viewSpaceSpeed ).y; + pa.m_StreakLengthAndEmitterProperties = PackFloat16( min16float2( streakLength, 0 ) ); + pa.m_StreakLengthAndEmitterProperties |= (0xffff0000 & emitterProperties); + + velocityXYEmitterNDotLAndRotation.xy = normalize( vsVelocity ); + velocityXYEmitterNDotLAndRotation.z = emitterNdotL; + velocityXYEmitterNDotLAndRotation.w = pa.m_Rotation; + + pa.m_PackedVelocityXYEmitterNDotLAndRotation = PackFloat16( (min16float4)velocityXYEmitterNDotLAndRotation ); + + // Pack the view spaced position and radius into a float4 buffer + float4 viewSpacePositionAndRadius; + + viewSpacePositionAndRadius.xyz = mul( g_mView, float4( vNewPosition, 1 ) ).xyz; + viewSpacePositionAndRadius.w = radius; + + g_PackedViewSpacePositions[ id.x ] = PackFloat16( (min16float4)viewSpacePositionAndRadius ); + + // For streaked particles (the sparks), calculate the the max radius in XY and store in a buffer + if ( streaks ) + { + float2 r2 = calcEllipsoidRadius( radius, viewSpaceSpeed ); + g_MaxRadiusBuffer[ id.x ] = max( r2.x, r2.y ); + } + else + { + // Not a streaked particle so will have rotation. When rotating, the particle has a max radius of the centre to the corner = sqrt( r^2 + r^2 ) + g_MaxRadiusBuffer[ id.x ] = 1.41 * radius; + } + + // Dead particles are added to the dead list for recycling + if ( pb.m_Age <= 0.0f || killParticle ) + { + pb.m_Age = -1; + + uint dstIdx = 0; + InterlockedAdd( g_DeadList[ 0 ], 1, dstIdx ); + g_DeadList[ dstIdx + 1 ] = id.x; + } + else + { + // Alive particles are added to the alive list + int index = 0; + InterlockedAdd( g_AliveParticleCount[ 0 ], 1, index ); + g_IndexBuffer[ index ] = id.x; + g_DistanceBuffer[ index ] = pb.m_DistanceToEye; + + uint dstIdx = 0; + // 6 indices per particle billboard + InterlockedAdd( g_DrawArgs[ 0 ].IndexCountPerInstance, 6, dstIdx ); + } + + // Write the particle data back to the global particle buffer + g_ParticleBufferA[ id.x ] = pa; + g_ParticleBufferB[ id.x ] = pb; + } +} + + +// Reset 256 particles per thread group, one thread per particle +// Also adds each particle to the dead list UAV +[numthreads(256,1,1)] +void CS_Reset( uint3 id : SV_DispatchThreadID, uint3 globalIdx : SV_DispatchThreadID ) +{ + if ( globalIdx.x == 0 ) + { + g_DeadList[ 0 ] = g_MaxParticles; + } + g_DeadList[ globalIdx.x + 1 ] = globalIdx.x; + + g_ParticleBufferA[ id.x ] = (GPUParticlePartA)0; + g_ParticleBufferB[ id.x ] = (GPUParticlePartB)0; +} diff --git a/src/GpuParticleShaders/ParticleStructs.h b/src/GpuParticleShaders/ParticleStructs.h new file mode 100644 index 0000000..0eb20b4 --- /dev/null +++ b/src/GpuParticleShaders/ParticleStructs.h @@ -0,0 +1,54 @@ +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. 
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+
+
+// Particle structures
+// ===================
+
+struct GPUParticlePartA
+{
+ uint2 m_PackedTintAndAlpha; // The color and opacity
+ uint2 m_PackedVelocityXYEmitterNDotLAndRotation; // Normalized view space velocity XY used for streak extrusion. The lighting term for the whole emitter in Z. The rotation angle in W.
+
+ uint m_StreakLengthAndEmitterProperties; // 0-15: fp16 streak length
+ // 16-23: The index of the emitter
+ // 24-28: Atlas index
+ // 30: Whether or not the emitter supports velocity-based streaks
+ // 31: Whether or not the particle is sleeping (i.e., don't update position)
+ float m_Rotation; // Uncompressed rotation - some issues with using fp16 rotation (also, saves unpacking it)
+ uint m_CollisionCount; // Keep track of how many times the particle has collided
+ uint m_pad;
+};
+
+struct GPUParticlePartB
+{
+ float3 m_Position; // World space position
+ float m_Mass; // Mass of particle
+
+ float3 m_Velocity; // World space velocity
+ float m_Lifespan; // Lifespan of the particle.
+
+ float m_DistanceToEye; // The distance from the particle to the eye
+ float m_Age; // The current age counting down from lifespan to zero
+ float m_StartSize; // The size at spawn time
+ float m_EndSize; // The size at maximum age
+};
diff --git a/src/GpuParticleShaders/RenderScene.hlsl b/src/GpuParticleShaders/RenderScene.hlsl
new file mode 100644
index 0000000..901c663
--- /dev/null
+++ b/src/GpuParticleShaders/RenderScene.hlsl
@@ -0,0 +1,109 @@
+//
+// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#include "Globals.h" + + +struct VS_RenderSceneInput +{ + float3 f3Position : POSITION; + float3 f3Normal : NORMAL; + float2 f2TexCoord : TEXCOORD0; + float3 f3Tangent : TANGENT; +}; + +struct PS_RenderSceneInput +{ + float4 f4Position : SV_Position; + float2 f2TexCoord : TEXCOORD0; + float3 f3Normal : NORMAL; + float3 f3Tangent : TANGENT; + float3 f3WorldPos : TEXCOORD2; +}; + +Texture2D g_txDiffuse : register( t2 ); +Texture2D g_txNormal : register( t3 ); + +//================================================================================================================================= +// This shader computes standard transform and lighting +//================================================================================================================================= +PS_RenderSceneInput VS_RenderScene( VS_RenderSceneInput I ) +{ + PS_RenderSceneInput O; + + // Transform the position from object space to homogeneous projection space + O.f4Position = mul( float4( I.f3Position, 1.0f ), g_mViewProjection ); + + O.f3WorldPos = I.f3Position; + O.f3Normal = normalize( I.f3Normal ); + O.f3Tangent = normalize( I.f3Tangent ); + + // Pass through tex coords + O.f2TexCoord = I.f2TexCoord; + + return O; +} + + +//================================================================================================================================= +// This shader outputs the pixel's color by passing through the lit +// diffuse material color +//================================================================================================================================= +float4 PS_RenderScene( PS_RenderSceneInput I ) : SV_Target0 +{ + float4 f4Diffuse = g_txDiffuse.Sample( g_samWrapLinear, I.f2TexCoord ); + float fSpecMask = f4Diffuse.a; + float3 f3Norm = g_txNormal.Sample( g_samWrapLinear, I.f2TexCoord ).xyz; + f3Norm *= 2.0f; + f3Norm -= float3( 1.0f, 1.0f, 1.0f ); + + float3 f3Binorm = normalize( cross( I.f3Normal, I.f3Tangent ) ); + float3x3 f3x3BasisMatrix = float3x3( f3Binorm, I.f3Tangent, I.f3Normal ); + f3Norm = normalize( mul( f3Norm, f3x3BasisMatrix ) ); + + // Diffuse lighting + float4 f4Lighting = saturate( dot( f3Norm, g_SunDirection.xyz ) ) * g_SunColor; + f4Lighting += g_AmbientColor; + + // Calculate specular power + float3 f3ViewDir = normalize( g_EyePosition.xyz - I.f3WorldPos ); + float3 f3HalfAngle = normalize( f3ViewDir + g_SunDirection.xyz ); + float4 f4SpecPower1 = pow( saturate( dot( f3HalfAngle, f3Norm ) ), 32 ) * g_SunColor; + + return f4Lighting * f4Diffuse + ( f4SpecPower1 * fSpecMask ); +} + + + +//-------------------------------------------------------------------------------------- +// PS for the sky +//-------------------------------------------------------------------------------------- +float4 PS_Sky( PS_RenderSceneInput I ) : SV_Target +{ + float4 f4O; + + // Bog standard textured rendering + f4O.xyz = g_txDiffuse.Sample( g_samWrapLinear, I.f2TexCoord ).xyz; + f4O.w = 1.0f; + + return f4O; +} \ No newline at end of file diff --git a/src/GpuParticleShaders/ShaderConstants.h b/src/GpuParticleShaders/ShaderConstants.h new file mode 100644 index 0000000..fa847b0 --- /dev/null +++ b/src/GpuParticleShaders/ShaderConstants.h @@ -0,0 +1,26 @@ +// +// Copyright (c) 2021 Advanced Micro Devices, 
Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +// This file is shared between the HLSL and C++ code for convenience + +// Maximum number of emitters supported +#define NUM_EMITTERS 4 diff --git a/src/GpuParticleShaders/SimulationBindings.h b/src/GpuParticleShaders/SimulationBindings.h new file mode 100644 index 0000000..d898992 --- /dev/null +++ b/src/GpuParticleShaders/SimulationBindings.h @@ -0,0 +1,121 @@ +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + + +#include "ShaderConstants.h" + + +// The particle buffers to fill with new particles +[[vk::binding( 0, 0 )]] RWStructuredBuffer g_ParticleBufferA : register( u0 ); +[[vk::binding( 1, 0 )]] RWStructuredBuffer g_ParticleBufferB : register( u1 ); + +// The dead list, so any particles that are retired this frame can be added to this list. The first element is the number of dead particles +[[vk::binding( 2, 0 )]] RWStructuredBuffer g_DeadList : register( u2 ); + +// The alive list which gets built in the similution. 
The distances also get written out +[[vk::binding( 3, 0 )]] RWStructuredBuffer g_IndexBuffer : register( u3 ); +[[vk::binding( 4, 0 )]] RWStructuredBuffer g_DistanceBuffer : register( u4 ); + +// The maximum radius in XY is calculated here and stored +[[vk::binding( 5, 0 )]] RWStructuredBuffer g_MaxRadiusBuffer : register( u5 ); + +// Viewspace particle positions are calculated here and stored +[[vk::binding( 6, 0 )]] RWStructuredBuffer g_PackedViewSpacePositions : register( u6 ); + +// The draw args for the ExecuteIndirect call needs to be filled in before the rasterization path is called, so do it here +struct IndirectCommand +{ +#ifdef API_DX12 + uint2 uav; +#endif + uint IndexCountPerInstance; + uint InstanceCount; + uint StartIndexLocation; + int BaseVertexLocation; + uint StartInstanceLocation; +}; +[[vk::binding( 7, 0 )]] RWStructuredBuffer g_DrawArgs : register( u7 ); + +[[vk::binding( 8, 0 )]] RWStructuredBuffer g_AliveParticleCount : register( u8 ); + +// The opaque scene's depth buffer read as a texture +[[vk::binding( 9, 0 )]] Texture2D g_DepthBuffer : register( t0 ); + +// A texture filled with random values for generating some variance in our particles when we spawn them +[[vk::binding( 10, 0 )]] Texture2D g_RandomBuffer : register( t1 ); + + +// Per-frame constant buffer +[[vk::binding( 11, 0 )]] cbuffer SimulationConstantBuffer : register( b0 ) +{ + float4 g_StartColor[ NUM_EMITTERS ]; + float4 g_EndColor[ NUM_EMITTERS ]; + + float4 g_EmitterLightingCenter[ NUM_EMITTERS ]; + + matrix g_mViewProjection; + matrix g_mView; + matrix g_mViewInv; + matrix g_mProjectionInv; + + float4 g_EyePosition; + float4 g_SunDirection; + + uint g_ScreenWidth; + uint g_ScreenHeight; + float g_ElapsedTime; + float g_CollisionThickness; + + int g_CollideParticles; + int g_ShowSleepingParticles; + int g_EnableSleepState; + float g_FrameTime; + + int g_MaxParticles; + uint g_Pad0; + uint g_Pad1; + uint g_Pad2; +}; + +[[vk::binding( 12, 0 )]] cbuffer EmitterConstantBuffer : register( b1 ) +{ + float4 g_vEmitterPosition; + float4 g_vEmitterVelocity; + float4 g_PositionVariance; + + int g_MaxParticlesThisFrame; + float g_ParticleLifeSpan; + float g_StartSize; + float g_EndSize; + + float g_VelocityVariance; + float g_Mass; + uint g_EmitterIndex; + uint g_EmitterStreaks; + + uint g_TextureIndex; + uint g_pads0; + uint g_pads1; + uint g_pads2; +}; + +[[vk::binding( 13, 0 )]] SamplerState g_samWrapPoint : register( s0 ); diff --git a/src/GpuParticleShaders/fp16util.h b/src/GpuParticleShaders/fp16util.h new file mode 100644 index 0000000..2b4df0f --- /dev/null +++ b/src/GpuParticleShaders/fp16util.h @@ -0,0 +1,169 @@ +// HLSL intrinsics +// cross +min16float3 RTGCross(min16float3 a, min16float3 b) +{ + return min16float3( + a.y * b.z - a.z * b.y, + a.z * b.x - a.x * b.z, + a.x * b.y - a.y * b.x); +} + +// dot +min16float RTGDot2(min16float2 a, min16float2 b) +{ + return a.x * b.x + a.y * b.y; +} + +min16float RTGDot3(min16float3 a, min16float3 b) +{ + return a.x * b.x + a.y * b.y + + a.z * b.z; +} + +min16float RTGDot4(min16float4 a, min16float4 b) +{ + return a.x * b.x + a.y * b.y + + a.z * b.z + a.w * b.w; +} + +// length +min16float RTGLength2(min16float2 a) +{ + return sqrt(RTGDot2(a, a)); +} + +min16float RTGLength3(min16float3 a) +{ + return sqrt(RTGDot3(a, a)); +} + +min16float RTGLength4(min16float4 a) +{ + return sqrt(RTGDot4(a, a)); +} + +// normalize +min16float2 RTGNormalize2(min16float2 a) +{ + min16float l = RTGLength2(a); + return l == 0.0 ? 
a : a / l; +} + +min16float3 RTGNormalize3(min16float3 a) +{ + min16float l = RTGLength3( a ); + return l == 0.0 ? a : a / l; +} + +min16float4 RTGNormalize4(min16float4 a) +{ + min16float l = RTGLength4( a ); + return l == 0.0 ? a : a / l; +} + + +// distance +min16float RTGDistance2(min16float2 from, min16float2 to) +{ + return RTGLength2(to - from); +} + +min16float RTGDistance3(min16float3 from, min16float3 to) +{ + return RTGLength3(to - from); +} + +min16float RTGDistance4(min16float4 from, min16float4 to) +{ + return RTGLength4(to - from); +} + + +// Packing and Unpacking +// min16{u}int2 +int PackInt16(min16int2 v) +{ + uint x = asuint(int(v.x)); + uint y = asuint(int(v.y)); + return asint(x | y << 16); +} + +uint PackInt16(min16uint2 v) +{ + return uint(v.x | (uint)(v.y) << 16); +} + +min16int2 UnpackInt16(int v) +{ + uint x = asuint(v.x) & 0xFFFF; + uint y = asuint(v.x) >> 16; + return min16uint2(asint(x), + asint(y)); +} + +min16uint2 UnpackInt16(uint v) +{ + return min16uint2(v.x & 0xFFFF, + v.x >> 16); +} + +// min16{u}int4 +int2 PackInt16(min16int4 v) +{ + return int2(PackInt16(v.xy), + PackInt16(v.zw)); +} + +uint2 PackInt16(min16uint4 v) +{ + return uint2(PackInt16(v.xy), + PackInt16(v.zw)); +} + +min16int4 UnpackInt16(int2 v) +{ + return min16int4(UnpackInt16(v.x), + UnpackInt16(v.y)); +} + +min16uint4 UnpackInt16(uint2 v) +{ + return min16uint4(UnpackInt16(v.x), + UnpackInt16(v.y)); +} + +uint PackFloat16( min16float v ) +{ + uint p = f32tof16( v ); + return p.x; +} + +// min16float2 +uint PackFloat16(min16float2 v) +{ + uint2 p = f32tof16(float2(v)); + return p.x | (p.y << 16); +} + +min16float2 UnpackFloat16(uint a) +{ + float2 tmp = f16tof32( + uint2(a & 0xFFFF, a >> 16)); + return min16float2(tmp); +} + + +// min16float4 +uint2 PackFloat16(min16float4 v) +{ + return uint2(PackFloat16(v.xy), + PackFloat16(v.zw)); +} + +min16float4 UnpackFloat16(uint2 v) +{ + return min16float4( + UnpackFloat16(v.x), + UnpackFloat16(v.y) + ); +} \ No newline at end of file diff --git a/src/GpuParticles/ParticleHelpers.h b/src/GpuParticles/ParticleHelpers.h new file mode 100644 index 0000000..3133630 --- /dev/null +++ b/src/GpuParticles/ParticleHelpers.h @@ -0,0 +1,36 @@ +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +#pragma once + +inline float RandomVariance( float median, float variance ) +{ + float fUnitRandomValue = (float)rand() / (float)RAND_MAX; + float fRange = variance * fUnitRandomValue; + return median - variance + (2.0f * fRange); +} + +inline float RandomFromAndTo( float lowest, float highest ) +{ + float fUnitRandomValue = (float)rand() / (float)RAND_MAX; + float fRange = (highest - lowest) * fUnitRandomValue; + return lowest + fRange; +} \ No newline at end of file diff --git a/src/GpuParticles/ParticleSystem.h b/src/GpuParticles/ParticleSystem.h new file mode 100644 index 0000000..2658135 --- /dev/null +++ b/src/GpuParticles/ParticleSystem.h @@ -0,0 +1,93 @@ +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// +#pragma once + +#include "stdafx.h" + +// Implementation-agnostic particle system interface +struct IParticleSystem +{ + enum Flags + { + PF_Sort = 1 << 0, // Sort the particles + PF_DepthCull = 1 << 1, // Do per-tile depth buffer culling + PF_Streaks = 1 << 2, // Streak the particles based on velocity + PF_Reactive = 1 << 3 // Particles also write to the reactive mask + }; + + // Per-emitter parameters + struct EmitterParams + { + math::Vector4 m_Position = {}; // World position of the emitter + math::Vector4 m_Velocity = {}; // Velocity of each particle from the emitter + math::Vector4 m_PositionVariance = {}; // Variance in position of each particle + int m_NumToEmit = 0; // Number of particles to emit this frame + float m_ParticleLifeSpan = 0.0f; // How long the particles should live + float m_StartSize = 0.0f; // Size of particles at spawn time + float m_EndSize = 0.0f; // Size of particle when they reach retirement age + float m_Mass = 0.0f; // Mass of particle + float m_VelocityVariance = 0.0f; // Variance in velocity of each particle + int m_TextureIndex = 0; // Index of the texture in the atlas + bool m_Streaks = false; // Streak the particles in the direction of travel + }; + + struct ConstantData + { + math::Matrix4 m_ViewProjection = {}; + math::Matrix4 m_View = {}; + math::Matrix4 m_ViewInv = {}; + math::Matrix4 m_Projection = {}; + math::Matrix4 m_ProjectionInv = {}; + + math::Vector4 m_StartColor[ 10 ] = {}; + math::Vector4 m_EndColor[ 10 ] = {}; + math::Vector4 m_EmitterLightingCenter[ 10 ] = {}; + + math::Vector4 m_SunDirection = {}; + math::Vector4 m_SunColor = {}; + math::Vector4 m_AmbientColor = {}; + + float m_FrameTime = 0.0f; + }; + + // Create a GPU particle system. Add more factory functions to create other types of system eg CPU-updated system + static IParticleSystem* CreateGPUSystem( const char* particleAtlas ); + + virtual ~IParticleSystem() {} + +#ifdef API_DX12 + virtual void Render( ID3D12GraphicsCommandList* pCommandList, DynamicBufferRing& constantBufferRing, int flags, const EmitterParams* pEmitters, int nNumEmitters, const ConstantData& constantData ) = 0; + virtual void OnCreateDevice( Device &device, UploadHeap& uploadHeap, ResourceViewHeaps& heaps, StaticBufferPool& bufferPool, DynamicBufferRing& constantBufferRing ) = 0; + virtual void OnResizedSwapChain( int width, int height, Texture& depthBuffer ) = 0; +#endif +#ifdef API_VULKAN + virtual void Render( VkCommandBuffer commandBuffer, DynamicBufferRing& constantBufferRing, int contextFlags, const EmitterParams* pEmitters, int nNumEmitters, const ConstantData& constantData ) = 0; + virtual void OnCreateDevice( Device &device, UploadHeap& uploadHeap, ResourceViewHeaps& heaps, StaticBufferPool& bufferPool, DynamicBufferRing& constantBufferRing, VkRenderPass renderPass ) = 0; + virtual void OnResizedSwapChain( int width, int height, Texture& depthBuffer, VkFramebuffer frameBuffer ) = 0; +#endif + + virtual void OnReleasingSwapChain() = 0; + virtual void OnDestroyDevice() = 0; + + // Completely resets the state of all particles. Handy for changing scenes etc + virtual void Reset() = 0; +}; diff --git a/src/GpuParticles/ParticleSystemInternal.h b/src/GpuParticles/ParticleSystemInternal.h new file mode 100644 index 0000000..c4c94eb --- /dev/null +++ b/src/GpuParticles/ParticleSystemInternal.h @@ -0,0 +1,154 @@ +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +#pragma once + +#include "stdafx.h" +#include "../GpuParticleShaders/ShaderConstants.h" +#include "ParticleSystem.h" + + +// Helper function to align values +int align( int value, int alignment ) { return ( value + (alignment - 1) ) & ~(alignment - 1); } + + +// GPUParticle structure is split into two sections for better cache efficiency - could even be SOA but would require creating more vertex buffers. +struct GPUParticlePartA +{ + math::Vector4 m_params[ 2 ]; +}; + +struct GPUParticlePartB +{ + math::Vector4 m_params[ 3 ]; +}; + + +struct SimulationConstantBuffer +{ + math::Vector4 m_StartColor[ NUM_EMITTERS ] = {}; + math::Vector4 m_EndColor[ NUM_EMITTERS ] = {}; + + math::Vector4 m_EmitterLightingCenter[ NUM_EMITTERS ] = {}; + + math::Matrix4 m_ViewProjection = {}; + math::Matrix4 m_View = {}; + math::Matrix4 m_ViewInv = {}; + math::Matrix4 m_ProjectionInv = {}; + + math::Vector4 m_EyePosition = {}; + math::Vector4 m_SunDirection = {}; + + UINT m_ScreenWidth = 0; + UINT m_ScreenHeight = 0; + float m_ElapsedTime = 0.0f; + float m_CollisionThickness = 4.0f; + + int m_CollideParticles = 0; + int m_ShowSleepingParticles = 0; + int m_EnableSleepState = 0; + float m_FrameTime = 0.0f; + + int m_MaxParticles = 0; + UINT m_pad01 = 0; + UINT m_pad02 = 0; + UINT m_pad03 = 0; +}; + +struct EmitterConstantBuffer +{ + math::Vector4 m_EmitterPosition = {}; + math::Vector4 m_EmitterVelocity = {}; + math::Vector4 m_PositionVariance = {}; + + int m_MaxParticlesThisFrame = 0; + float m_ParticleLifeSpan = 0.0f; + float m_StartSize = 0.0f; + float m_EndSize = 0.0f; + + float m_VelocityVariance = 0.0f; + float m_Mass = 0.0f; + int m_Index = 0; + int m_Streaks = 0; + + int m_TextureIndex = 0; + int m_pads[ 3 ] = {}; +}; + + +// The rasterization path constant buffer +struct RenderingConstantBuffer +{ + math::Matrix4 m_Projection = {}; + math::Matrix4 m_ProjectionInv = {}; + math::Vector4 m_SunColor = {}; + math::Vector4 m_AmbientColor = {}; + math::Vector4 m_SunDirectionVS = {}; + UINT m_ScreenWidth = 0; + UINT m_ScreenHeight = 0; + UINT m_pads[ 2 ] = {}; +}; + +struct CullingConstantBuffer +{ + math::Matrix4 m_ProjectionInv = {}; + math::Matrix4 m_Projection = {}; + + UINT m_ScreenWidth = 0; + UINT m_ScreenHeight = 0; + UINT m_NumTilesX = 0; + UINT m_NumCoarseCullingTilesX = 0; + + UINT m_NumCullingTilesPerCoarseTileX = 0; + UINT m_NumCullingTilesPerCoarseTileY = 0; + UINT m_pad01 = 0; + UINT m_pad02 = 0; +}; + +struct 
TiledRenderingConstantBuffer +{ + math::Matrix4 m_ProjectionInv = {}; + math::Vector4 m_SunColor = {}; + math::Vector4 m_AmbientColor = {}; + math::Vector4 m_SunDirectionVS = {}; + + UINT m_ScreenHeight = 0; + float m_InvScreenWidth = 0.0f; + float m_InvScreenHeight = 0.0f; + float m_AlphaThreshold = 0.97f; + + UINT m_NumTilesX = 0; + UINT m_NumCoarseCullingTilesX = 0; + UINT m_NumCullingTilesPerCoarseTileX = 0; + UINT m_NumCullingTilesPerCoarseTileY = 0; + + UINT m_AlignedScreenWidth = 0; + UINT m_pads[ 3 ] = {}; +}; + +struct QuadConstantBuffer +{ + UINT m_AlignedScreenWidth; + UINT m_pads[ 3 ]; +}; + +// The maximum number of supported GPU particles +static const int g_maxParticles = 400*1024; diff --git a/src/GpuParticles/dx12/GPUParticleSystem.cpp b/src/GpuParticles/dx12/GPUParticleSystem.cpp new file mode 100644 index 0000000..da9f0cc --- /dev/null +++ b/src/GpuParticles/dx12/GPUParticleSystem.cpp @@ -0,0 +1,745 @@ +// +// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// +#include "../DX12/stdafx.h" +#include "../ParticleSystem.h" +#include "../ParticleSystemInternal.h" +#include "../ParticleHelpers.h" +#include "ParallelSort.h" + + +const D3D12_RESOURCE_STATES SHADER_READ_STATE = D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER|D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE|D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE; + + +#pragma warning( disable : 4100 ) // disable unreference formal parameter warnings for /W4 builds + +struct IndirectCommand +{ + D3D12_GPU_VIRTUAL_ADDRESS uav = {}; + D3D12_DRAW_INDEXED_ARGUMENTS drawArguments = {}; +}; + +// GPU Particle System class. 
Responsible for updating and rendering the particles +class GPUParticleSystem : public IParticleSystem +{ +public: + + GPUParticleSystem( const char* particleAtlas ); + +private: + + enum DepthCullingMode + { + DepthCullingOn, + DepthCullingOff, + NumDepthCullingModes + }; + + enum StreakMode + { + StreaksOn, + StreaksOff, + NumStreakModes + }; + + enum ReactiveMode + { + ReactiveOn, + ReactiveOff, + NumReactiveModes + }; + + virtual ~GPUParticleSystem(); + + virtual void OnCreateDevice( Device &device, UploadHeap& uploadHeap, ResourceViewHeaps& heaps, StaticBufferPool& bufferPool, DynamicBufferRing& constantBufferRing ); + virtual void OnResizedSwapChain( int width, int height, Texture& depthBuffer ); + virtual void OnReleasingSwapChain(); + virtual void OnDestroyDevice(); + + virtual void Reset(); + + virtual void Render( ID3D12GraphicsCommandList* pCommandList, DynamicBufferRing& constantBufferRing, int flags, const EmitterParams* pEmitters, int nNumEmitters, const ConstantData& constantData ); + + void Emit( ID3D12GraphicsCommandList* pCommandList, DynamicBufferRing& constantBufferRing, int numEmitters, const EmitterParams* emitters ); + void Simulate( ID3D12GraphicsCommandList* pCommandList ); + void Sort( ID3D12GraphicsCommandList* pCommandList ); + + void FillRandomTexture( UploadHeap& uploadHeap ); + + void CreateSimulationAssets(); + void CreateRasterizedRenderingAssets(); + + Device* m_pDevice = nullptr; + ResourceViewHeaps* m_heaps = nullptr; + const char* m_AtlasPath = nullptr; + + Texture m_Atlas = {}; + Texture m_ParticleBufferA = {}; + Texture m_ParticleBufferB = {}; + Texture m_PackedViewSpaceParticlePositions = {}; + Texture m_MaxRadiusBuffer = {}; + Texture m_DeadListBuffer = {}; + Texture m_AliveIndexBuffer = {}; + Texture m_AliveDistanceBuffer = {}; + Texture m_AliveCountBuffer = {}; + Texture m_RenderingBuffer = {}; + Texture m_IndirectArgsBuffer = {}; + Texture m_RandomTexture = {}; + + const int m_SimulationUAVDescriptorTableCount = 9; + CBV_SRV_UAV m_SimulationUAVDescriptorTable = {}; + + const int m_SimulationSRVDescriptorTableCount = 2; + CBV_SRV_UAV m_SimulationSRVDescriptorTable = {}; + + const int m_RasterizationSRVDescriptorTableCount = 6; + CBV_SRV_UAV m_RasterizationSRVDescriptorTable = {}; + + UINT m_ScreenWidth = 0; + UINT m_ScreenHeight = 0; + float m_InvScreenWidth = 0.0f; + float m_InvScreenHeight = 0.0f; + float m_ElapsedTime = 0.0f; + float m_AlphaThreshold = 0.97f; + + D3D12_INDEX_BUFFER_VIEW m_IndexBuffer = {}; + ID3D12RootSignature* m_pSimulationRootSignature = nullptr; + ID3D12RootSignature* m_pRasterizationRootSignature = nullptr; + + ID3D12PipelineState* m_pSimulatePipeline = nullptr; + ID3D12PipelineState* m_pEmitPipeline = nullptr; + ID3D12PipelineState* m_pResetParticlesPipeline = nullptr; + ID3D12PipelineState* m_pRasterizationPipelines[ NumStreakModes ][ NumReactiveModes ] = {}; + + ID3D12CommandSignature* m_commandSignature = nullptr; + + bool m_ResetSystem = true; + FFXParallelSort m_SortLib = {}; + + D3D12_RESOURCE_STATES m_ReadBufferStates; + D3D12_RESOURCE_STATES m_WriteBufferStates; + D3D12_RESOURCE_STATES m_StridedBufferStates; +}; + +IParticleSystem* IParticleSystem::CreateGPUSystem( const char* particleAtlas ) +{ + return new GPUParticleSystem( particleAtlas ); +} + + +GPUParticleSystem::GPUParticleSystem( const char* particleAtlas ) : m_AtlasPath( particleAtlas ) +{ +} + + +GPUParticleSystem::~GPUParticleSystem() +{ +} + + +void GPUParticleSystem::Sort( ID3D12GraphicsCommandList* pCommandList ) +{ + // Causes the debug layer to 
lock up + m_SortLib.Draw( pCommandList ); +} + + +void GPUParticleSystem::Reset() +{ + m_ResetSystem = true; +} + +void GPUParticleSystem::Render( ID3D12GraphicsCommandList* pCommandList, DynamicBufferRing& constantBufferRing, int flags, const EmitterParams* pEmitters, int nNumEmitters, const ConstantData& constantData ) +{ + std::vector barriersBeforeSimulation; + if(m_WriteBufferStates == D3D12_RESOURCE_STATE_COMMON) + { + barriersBeforeSimulation.push_back(CD3DX12_RESOURCE_BARRIER::Transition(m_ParticleBufferB.GetResource(), m_WriteBufferStates, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)); + barriersBeforeSimulation.push_back(CD3DX12_RESOURCE_BARRIER::Transition(m_DeadListBuffer.GetResource(), m_WriteBufferStates, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)); + barriersBeforeSimulation.push_back(CD3DX12_RESOURCE_BARRIER::Transition(m_AliveDistanceBuffer.GetResource(), m_WriteBufferStates, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)); + barriersBeforeSimulation.push_back(CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectArgsBuffer.GetResource(), m_WriteBufferStates, D3D12_RESOURCE_STATE_UNORDERED_ACCESS)); + m_WriteBufferStates = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + } + + ID3D12DescriptorHeap* descriptorHeaps[] = { m_heaps->GetCBV_SRV_UAVHeap(), m_heaps->GetSamplerHeap() }; + pCommandList->SetDescriptorHeaps( _countof( descriptorHeaps ), descriptorHeaps ); + + SimulationConstantBuffer simulationConstants = {}; + + memcpy( simulationConstants.m_StartColor, constantData.m_StartColor, sizeof( simulationConstants.m_StartColor ) ); + memcpy( simulationConstants.m_EndColor, constantData.m_EndColor, sizeof( simulationConstants.m_EndColor ) ); + memcpy( simulationConstants.m_EmitterLightingCenter, constantData.m_EmitterLightingCenter, sizeof( simulationConstants.m_EmitterLightingCenter ) ); + + simulationConstants.m_ViewProjection = constantData.m_ViewProjection; + simulationConstants.m_View = constantData.m_View; + simulationConstants.m_ViewInv = constantData.m_ViewInv; + simulationConstants.m_ProjectionInv = constantData.m_ProjectionInv; + + simulationConstants.m_EyePosition = constantData.m_ViewInv.getCol3(); + simulationConstants.m_SunDirection = constantData.m_SunDirection; + + simulationConstants.m_ScreenWidth = m_ScreenWidth; + simulationConstants.m_ScreenHeight = m_ScreenHeight; + simulationConstants.m_MaxParticles = g_maxParticles; + simulationConstants.m_FrameTime = constantData.m_FrameTime; + + math::Vector4 sunDirectionVS = constantData.m_View * constantData.m_SunDirection; + + m_ElapsedTime += constantData.m_FrameTime; + if ( m_ElapsedTime > 10.0f ) + m_ElapsedTime -= 10.0f; + + simulationConstants.m_ElapsedTime = m_ElapsedTime; + + { + UserMarker marker( pCommandList, "simulation" ); + + void* data = nullptr; + D3D12_GPU_VIRTUAL_ADDRESS constantBuffer; + constantBufferRing.AllocConstantBuffer( sizeof( simulationConstants ), &data, &constantBuffer ); + memcpy( data, &simulationConstants, sizeof( simulationConstants ) ); + + + pCommandList->SetComputeRootSignature( m_pSimulationRootSignature ); + pCommandList->SetComputeRootDescriptorTable( 0, m_SimulationUAVDescriptorTable.GetGPU() ); + pCommandList->SetComputeRootDescriptorTable( 1, m_SimulationSRVDescriptorTable.GetGPU() ); + pCommandList->SetComputeRootConstantBufferView( 2, constantBuffer ); + + barriersBeforeSimulation.push_back( CD3DX12_RESOURCE_BARRIER::Transition( m_ParticleBufferA.GetResource(), m_ReadBufferStates, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ) ); + barriersBeforeSimulation.push_back( CD3DX12_RESOURCE_BARRIER::Transition( 
m_PackedViewSpaceParticlePositions.GetResource(), m_ReadBufferStates, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ) ); + barriersBeforeSimulation.push_back( CD3DX12_RESOURCE_BARRIER::Transition( m_MaxRadiusBuffer.GetResource(), m_ReadBufferStates, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ) ); + barriersBeforeSimulation.push_back( CD3DX12_RESOURCE_BARRIER::Transition( m_AliveIndexBuffer.GetResource(), m_ReadBufferStates, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ) ); + barriersBeforeSimulation.push_back( CD3DX12_RESOURCE_BARRIER::Transition( m_AliveCountBuffer.GetResource(), m_ReadBufferStates, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ) ); + pCommandList->ResourceBarrier( (UINT)barriersBeforeSimulation.size(), &barriersBeforeSimulation[ 0 ] ); + m_ReadBufferStates = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + + // If we are resetting the particle system, then initialize the dead list + if ( m_ResetSystem ) + { + pCommandList->SetPipelineState( m_pResetParticlesPipeline ); + + pCommandList->Dispatch( align( g_maxParticles, 256 ) / 256, 1, 1 ); + + std::vector barriersPostReset; + barriersPostReset.push_back( CD3DX12_RESOURCE_BARRIER::UAV( m_ParticleBufferA.GetResource() ) ); + barriersPostReset.push_back( CD3DX12_RESOURCE_BARRIER::UAV( m_ParticleBufferB.GetResource() ) ); + barriersPostReset.push_back( CD3DX12_RESOURCE_BARRIER::UAV( m_DeadListBuffer.GetResource() ) ); + pCommandList->ResourceBarrier( (UINT)barriersPostReset.size(), &barriersPostReset[ 0 ] ); + + m_ResetSystem = false; + } + + // Emit particles into the system + Emit( pCommandList, constantBufferRing, nNumEmitters, pEmitters ); + + // Run the simulation for this frame + Simulate( pCommandList ); + + + + std::vector barriersAfterSimulation; + barriersAfterSimulation.push_back( CD3DX12_RESOURCE_BARRIER::Transition( m_ParticleBufferA.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, SHADER_READ_STATE) ); + barriersAfterSimulation.push_back( CD3DX12_RESOURCE_BARRIER::Transition( m_PackedViewSpaceParticlePositions.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, SHADER_READ_STATE) ); + barriersAfterSimulation.push_back( CD3DX12_RESOURCE_BARRIER::Transition( m_MaxRadiusBuffer.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, SHADER_READ_STATE) ); + barriersAfterSimulation.push_back( CD3DX12_RESOURCE_BARRIER::Transition( m_AliveCountBuffer.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, SHADER_READ_STATE) ); + barriersAfterSimulation.push_back( CD3DX12_RESOURCE_BARRIER::UAV( m_DeadListBuffer.GetResource() ) ); + pCommandList->ResourceBarrier( (UINT)barriersAfterSimulation.size(), &barriersAfterSimulation[ 0 ] ); + } + + // Conventional rasterization path + { + UserMarker marker( pCommandList, "rasterization" ); + + // Sort if requested. Not doing so results in the particles rendering out of order and not blending correctly + if ( flags & PF_Sort ) + { + UserMarker marker( pCommandList, "sorting" ); + + const D3D12_RESOURCE_BARRIER barriers[] = + { + CD3DX12_RESOURCE_BARRIER::UAV( m_AliveIndexBuffer.GetResource() ), + CD3DX12_RESOURCE_BARRIER::UAV( m_AliveDistanceBuffer.GetResource() ), + }; + pCommandList->ResourceBarrier( _countof( barriers ), barriers ); + + Sort( pCommandList ); + } + + StreakMode streaks = flags & PF_Streaks ? StreaksOn : StreaksOff; + ReactiveMode reactive = flags & PF_Reactive ? 
ReactiveOn : ReactiveOff; + + RenderingConstantBuffer* cb = nullptr; + D3D12_GPU_VIRTUAL_ADDRESS renderingConstantBuffer; + constantBufferRing.AllocConstantBuffer( sizeof( RenderingConstantBuffer ), (void**)&cb, &renderingConstantBuffer ); + cb->m_Projection = constantData.m_Projection; + cb->m_ProjectionInv = simulationConstants.m_ProjectionInv; + cb->m_SunColor = constantData.m_SunColor; + cb->m_AmbientColor = constantData.m_AmbientColor; + cb->m_SunDirectionVS = sunDirectionVS; + cb->m_ScreenWidth = m_ScreenWidth; + cb->m_ScreenHeight = m_ScreenHeight; + + pCommandList->SetGraphicsRootSignature( m_pRasterizationRootSignature ); + pCommandList->SetGraphicsRootDescriptorTable( 0, m_RasterizationSRVDescriptorTable.GetGPU() ); + pCommandList->SetGraphicsRootConstantBufferView( 1, renderingConstantBuffer ); + pCommandList->SetGraphicsRootUnorderedAccessView( 2, m_IndirectArgsBuffer.GetResource()->GetGPUVirtualAddress() ); + pCommandList->SetPipelineState( m_pRasterizationPipelines[ streaks ][ reactive ] ); + + pCommandList->IASetIndexBuffer( &m_IndexBuffer ); + pCommandList->IASetVertexBuffers( 0, 0, nullptr ); + pCommandList->IASetPrimitiveTopology( D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST ); + + std::vector barriers; + barriers.push_back( CD3DX12_RESOURCE_BARRIER::Transition( m_AliveIndexBuffer.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, SHADER_READ_STATE) ); + barriers.push_back( CD3DX12_RESOURCE_BARRIER::Transition( m_IndirectArgsBuffer.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT ) ); + pCommandList->ResourceBarrier( (UINT)barriers.size(), &barriers[ 0 ] ); + + pCommandList->ExecuteIndirect( m_commandSignature, 1, m_IndirectArgsBuffer.GetResource(), 0, nullptr, 0 ); + + pCommandList->ResourceBarrier( 1, &CD3DX12_RESOURCE_BARRIER::Transition( m_IndirectArgsBuffer.GetResource(), D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT, D3D12_RESOURCE_STATE_UNORDERED_ACCESS ) ); + } + + m_ReadBufferStates = SHADER_READ_STATE; +} + + +void GPUParticleSystem::OnCreateDevice(Device &device, UploadHeap& uploadHeap, ResourceViewHeaps& heaps, StaticBufferPool& bufferPool, DynamicBufferRing& constantBufferRing ) +{ + m_pDevice = &device; + m_heaps = &heaps; + + m_ReadBufferStates = D3D12_RESOURCE_STATE_COMMON; + m_WriteBufferStates = D3D12_RESOURCE_STATE_COMMON; // D3D12_RESOURCE_STATE_UNORDERED_ACCESS + m_StridedBufferStates = D3D12_RESOURCE_STATE_COMMON; + + // Create the global particle pool. Each particle is split into two parts for better cache coherency. 
The first half contains the data more + // relevant to rendering while the second half is more related to simulation + CD3DX12_RESOURCE_DESC RDescParticlesA = CD3DX12_RESOURCE_DESC::Buffer( sizeof( GPUParticlePartA ) * g_maxParticles, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS ); + m_ParticleBufferA.InitBuffer(&device, "ParticleBufferA", &RDescParticlesA, sizeof( GPUParticlePartA ), m_ReadBufferStates); + + CD3DX12_RESOURCE_DESC RDescParticlesB = CD3DX12_RESOURCE_DESC::Buffer( sizeof( GPUParticlePartB ) * g_maxParticles, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS ); + m_ParticleBufferB.InitBuffer(&device, "ParticleBufferB", &RDescParticlesB, sizeof( GPUParticlePartB ), m_WriteBufferStates); + + // The packed view space positions of particles are cached during simulation so allocate a buffer for them + CD3DX12_RESOURCE_DESC RDescPackedViewSpaceParticlePositions = CD3DX12_RESOURCE_DESC::Buffer( 8 * g_maxParticles, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS ); + m_PackedViewSpaceParticlePositions.InitBuffer(&device, "PackedViewSpaceParticlePositions", &RDescPackedViewSpaceParticlePositions, 8, m_ReadBufferStates); + + // The maximum radii of each particle is cached during simulation to avoid recomputing multiple times later. This is only required + // for streaked particles as they are not round so we cache the max radius of X and Y + CD3DX12_RESOURCE_DESC RDescMaxRadiusBuffer = CD3DX12_RESOURCE_DESC::Buffer( sizeof( float ) * g_maxParticles, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS ); + m_MaxRadiusBuffer.InitBuffer(&device, "MaxRadiusBuffer", &RDescMaxRadiusBuffer, sizeof( float ), m_ReadBufferStates); + + // The dead particle index list. Created as an append buffer + CD3DX12_RESOURCE_DESC RDescDeadListBuffer = CD3DX12_RESOURCE_DESC::Buffer( sizeof( INT ) * ( g_maxParticles + 1 ), D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS ); + m_DeadListBuffer.InitBuffer(&device, "DeadListBuffer", &RDescDeadListBuffer, sizeof( INT ), m_WriteBufferStates); + + // Create the index buffer of alive particles that is to be sorted (at least in the rasterization path). + // For the tiled rendering path this could be just a UINT index buffer as particles are not globally sorted + CD3DX12_RESOURCE_DESC RDescAliveIndexBuffer = CD3DX12_RESOURCE_DESC::Buffer( sizeof( int ) * g_maxParticles, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS ); + m_AliveIndexBuffer.InitBuffer(&device, "AliveIndexBuffer", &RDescAliveIndexBuffer, sizeof( int ), m_ReadBufferStates); + + CD3DX12_RESOURCE_DESC RDescAliveDistanceBuffer = CD3DX12_RESOURCE_DESC::Buffer( sizeof( float ) * g_maxParticles, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS ); + m_AliveDistanceBuffer.InitBuffer(&device, "AliveDistanceBuffer", &RDescAliveDistanceBuffer, sizeof( float ), m_WriteBufferStates); + + // Create the single element buffer which is used to store the count of alive particles + CD3DX12_RESOURCE_DESC RDescAliveCountBuffer = CD3DX12_RESOURCE_DESC::Buffer( sizeof( UINT ), D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS ); + m_AliveCountBuffer.InitBuffer(&device, "AliveCountBuffer", &RDescAliveCountBuffer, sizeof( UINT ), m_ReadBufferStates); + + + // Create the buffer to store the indirect args for the ExecuteIndirect call + // Create the index buffer of alive particles that is to be sorted (at least in the rasterization path). 
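+    // Note (illustrative sketch, not taken from the original source): this buffer holds a single
+    // IndirectCommand whose layout has to match the command signature created in
+    // CreateRasterizedRenderingAssets() -- a UAV argument targeting root parameter 2 followed by
+    // D3D12_DRAW_INDEXED_ARGUMENTS, with ByteStride == sizeof( IndirectCommand ). The simulation
+    // shaders are expected to refresh the draw arguments each frame; the exact values below are an
+    // assumption, the authoritative code lives in ParticleSimulation.hlsl:
+    //
+    //   IndirectCommand cmd = {};
+    //   cmd.uav                                 = <GPU VA consumed through root parameter 2>;
+    //   cmd.drawArguments.IndexCountPerInstance = aliveParticleCount * 6;  // two triangles per billboard
+    //   cmd.drawArguments.InstanceCount         = 1;
+    //   cmd.drawArguments.StartIndexLocation    = 0;
+    //   cmd.drawArguments.BaseVertexLocation    = 0;
+    //   cmd.drawArguments.StartInstanceLocation = 0;
+    //
+    // Render() then kicks off the whole rasterization path with one ExecuteIndirect() call on this buffer.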
+ CD3DX12_RESOURCE_DESC desc = CD3DX12_RESOURCE_DESC::Buffer( sizeof( IndirectCommand ), D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS ); + m_IndirectArgsBuffer.InitBuffer(&device, "IndirectArgsBuffer", &desc, sizeof( IndirectCommand ), m_WriteBufferStates); + + // Create the particle billboard index buffer required for the rasterization VS-only path + UINT* indices = new UINT[ g_maxParticles * 6 ]; + UINT* ptr = indices; + UINT base = 0; + for ( int i = 0; i < g_maxParticles; i++ ) + { + ptr[ 0 ] = base + 0; + ptr[ 1 ] = base + 1; + ptr[ 2 ] = base + 2; + + ptr[ 3 ] = base + 2; + ptr[ 4 ] = base + 1; + ptr[ 5 ] = base + 3; + + base += 4; + ptr += 6; + } + + bufferPool.AllocIndexBuffer( g_maxParticles * 6, sizeof( UINT ), indices, &m_IndexBuffer ); + delete[] indices; + + // Initialize the random numbers texture + FillRandomTexture( uploadHeap ); + + m_Atlas.InitFromFile( &device, &uploadHeap, m_AtlasPath, true ); + + CreateSimulationAssets(); + CreateRasterizedRenderingAssets(); + + // Create the SortLib resources + m_SortLib.OnCreate( m_pDevice, m_heaps, &constantBufferRing, &uploadHeap, &m_AliveCountBuffer, &m_AliveDistanceBuffer, &m_AliveIndexBuffer ); +} + +void GPUParticleSystem::CreateSimulationAssets() +{ + m_heaps->AllocCBV_SRV_UAVDescriptor( m_SimulationUAVDescriptorTableCount, &m_SimulationUAVDescriptorTable ); + + m_ParticleBufferA.CreateBufferUAV( 0, nullptr, &m_SimulationUAVDescriptorTable ); + m_ParticleBufferB.CreateBufferUAV( 1, nullptr, &m_SimulationUAVDescriptorTable ); + m_DeadListBuffer.CreateBufferUAV( 2, nullptr, &m_SimulationUAVDescriptorTable ); + m_AliveIndexBuffer.CreateBufferUAV( 3, nullptr, &m_SimulationUAVDescriptorTable ); + m_AliveDistanceBuffer.CreateBufferUAV( 4, nullptr, &m_SimulationUAVDescriptorTable ); + m_MaxRadiusBuffer.CreateBufferUAV( 5, nullptr, &m_SimulationUAVDescriptorTable ); + m_PackedViewSpaceParticlePositions.CreateBufferUAV( 6, nullptr, &m_SimulationUAVDescriptorTable ); + m_IndirectArgsBuffer.CreateBufferUAV( 7, nullptr, &m_SimulationUAVDescriptorTable ); + m_AliveCountBuffer.CreateBufferUAV( 8, nullptr, &m_SimulationUAVDescriptorTable ); + + m_heaps->AllocCBV_SRV_UAVDescriptor( m_SimulationSRVDescriptorTableCount, &m_SimulationSRVDescriptorTable ); + // depth buffer // t0 + m_RandomTexture.CreateSRV( 1, &m_SimulationSRVDescriptorTable ); // t1 + + { + CD3DX12_DESCRIPTOR_RANGE DescRange[2] = {}; + DescRange[0].Init( D3D12_DESCRIPTOR_RANGE_TYPE_UAV, m_SimulationUAVDescriptorTableCount, 0 ); // u0 - u8 + DescRange[1].Init( D3D12_DESCRIPTOR_RANGE_TYPE_SRV, m_SimulationSRVDescriptorTableCount, 0 ); // t0 - t1 + + CD3DX12_ROOT_PARAMETER rootParamters[4] = {}; + rootParamters[0].InitAsDescriptorTable( 1, &DescRange[0], D3D12_SHADER_VISIBILITY_ALL ); // uavs + rootParamters[1].InitAsDescriptorTable( 1, &DescRange[1], D3D12_SHADER_VISIBILITY_ALL ); // textures + rootParamters[2].InitAsConstantBufferView( 0 ); // b0 - per frame + rootParamters[3].InitAsConstantBufferView( 1 ); // b1 - per emitter + + CD3DX12_STATIC_SAMPLER_DESC sampler( 0, D3D12_FILTER_MIN_MAG_MIP_POINT, D3D12_TEXTURE_ADDRESS_MODE_WRAP, D3D12_TEXTURE_ADDRESS_MODE_WRAP, D3D12_TEXTURE_ADDRESS_MODE_CLAMP ); + + CD3DX12_ROOT_SIGNATURE_DESC descRootSignature = {}; + descRootSignature.Init( _countof( rootParamters ), rootParamters, 1, &sampler ); + + ID3DBlob *pOutBlob, *pErrorBlob = nullptr; + D3D12SerializeRootSignature( &descRootSignature, D3D_ROOT_SIGNATURE_VERSION_1, &pOutBlob, &pErrorBlob ); + m_pDevice->GetDevice()->CreateRootSignature( 0, pOutBlob->GetBufferPointer(), 
pOutBlob->GetBufferSize(), IID_PPV_ARGS( &m_pSimulationRootSignature ) ); + m_pSimulationRootSignature->SetName( L"SimulationRootSignature" ); + + pOutBlob->Release(); + if (pErrorBlob) + pErrorBlob->Release(); + } + + D3D12_COMPUTE_PIPELINE_STATE_DESC descPso = {}; + descPso.Flags = D3D12_PIPELINE_STATE_FLAG_NONE; + descPso.pRootSignature = m_pSimulationRootSignature; + descPso.NodeMask = 0; + + DefineList defines; + defines["API_DX12"] = ""; + + { + D3D12_SHADER_BYTECODE computeShader; + CompileShaderFromFile( "ParticleSimulation.hlsl", &defines, "CS_Reset", "-T cs_6_0", &computeShader ); + + descPso.CS = computeShader; + m_pDevice->GetDevice()->CreateComputePipelineState( &descPso, IID_PPV_ARGS( &m_pResetParticlesPipeline ) ); + m_pResetParticlesPipeline->SetName( L"ResetParticles" ); + } + + { + D3D12_SHADER_BYTECODE computeShader; + CompileShaderFromFile( "ParticleSimulation.hlsl", &defines, "CS_Simulate", "-T cs_6_0", &computeShader ); + + descPso.CS = computeShader; + m_pDevice->GetDevice()->CreateComputePipelineState( &descPso, IID_PPV_ARGS( &m_pSimulatePipeline ) ); + m_pSimulatePipeline->SetName( L"Simulation" ); + } + + { + D3D12_SHADER_BYTECODE computeShader; + CompileShaderFromFile( "ParticleEmit.hlsl", &defines, "CS_Emit", "-T cs_6_0", &computeShader ); + + descPso.CS = computeShader; + m_pDevice->GetDevice()->CreateComputePipelineState( &descPso, IID_PPV_ARGS( &m_pEmitPipeline ) ); + m_pEmitPipeline->SetName( L"Emit" ); + } +} + + +void GPUParticleSystem::CreateRasterizedRenderingAssets() +{ + m_heaps->AllocCBV_SRV_UAVDescriptor( m_RasterizationSRVDescriptorTableCount, &m_RasterizationSRVDescriptorTable ); + m_ParticleBufferA.CreateSRV( 0, &m_RasterizationSRVDescriptorTable ); + m_PackedViewSpaceParticlePositions.CreateSRV( 1, &m_RasterizationSRVDescriptorTable ); + m_AliveCountBuffer.CreateSRV( 2, &m_RasterizationSRVDescriptorTable ); + m_AliveIndexBuffer.CreateSRV( 3, &m_RasterizationSRVDescriptorTable ); + m_Atlas.CreateSRV( 4, &m_RasterizationSRVDescriptorTable ); + // depth texture t5 + + { + CD3DX12_DESCRIPTOR_RANGE DescRange[1] = {}; + DescRange[0].Init( D3D12_DESCRIPTOR_RANGE_TYPE_SRV, m_RasterizationSRVDescriptorTableCount, 0 ); // t0-t5 + + CD3DX12_ROOT_PARAMETER rootParamters[3] = {}; + rootParamters[0].InitAsDescriptorTable( 1, &DescRange[0], D3D12_SHADER_VISIBILITY_ALL ); // textures + rootParamters[1].InitAsConstantBufferView( 0 ); // b0 + rootParamters[2].InitAsUnorderedAccessView( 0 ); + + CD3DX12_STATIC_SAMPLER_DESC sampler( 0, D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT, D3D12_TEXTURE_ADDRESS_MODE_CLAMP, D3D12_TEXTURE_ADDRESS_MODE_CLAMP, D3D12_TEXTURE_ADDRESS_MODE_CLAMP ); + + CD3DX12_ROOT_SIGNATURE_DESC descRootSignature = {}; + descRootSignature.Init( _countof( rootParamters ), rootParamters, 1, &sampler ); + + ID3DBlob *pOutBlob, *pErrorBlob = nullptr; + D3D12SerializeRootSignature( &descRootSignature, D3D_ROOT_SIGNATURE_VERSION_1, &pOutBlob, &pErrorBlob ); + m_pDevice->GetDevice()->CreateRootSignature( 0, pOutBlob->GetBufferPointer(), pOutBlob->GetBufferSize(), IID_PPV_ARGS( &m_pRasterizationRootSignature ) ); + m_pRasterizationRootSignature->SetName( L"RasterizationRootSignature" ); + + pOutBlob->Release(); + if (pErrorBlob) + pErrorBlob->Release(); + } + + D3D12_GRAPHICS_PIPELINE_STATE_DESC descPso = {}; + descPso.InputLayout = { nullptr, 0 }; + descPso.pRootSignature = m_pRasterizationRootSignature; + + descPso.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT); + descPso.RasterizerState.CullMode = D3D12_CULL_MODE_NONE; + descPso.BlendState = 
CD3DX12_BLEND_DESC(D3D12_DEFAULT); + descPso.BlendState.IndependentBlendEnable = true; + descPso.BlendState.RenderTarget[0].BlendEnable = true; + descPso.BlendState.RenderTarget[2].BlendEnable = true; + + descPso.BlendState.RenderTarget[0].SrcBlend = D3D12_BLEND_SRC_ALPHA; + descPso.BlendState.RenderTarget[0].DestBlend = D3D12_BLEND_INV_SRC_ALPHA; + descPso.BlendState.RenderTarget[0].BlendOp = D3D12_BLEND_OP_ADD; + descPso.BlendState.RenderTarget[0].SrcBlendAlpha = D3D12_BLEND_INV_SRC_ALPHA; + descPso.BlendState.RenderTarget[0].DestBlendAlpha = D3D12_BLEND_ZERO; + descPso.BlendState.RenderTarget[0].BlendOpAlpha = D3D12_BLEND_OP_ADD; + + descPso.BlendState.RenderTarget[0].RenderTargetWriteMask = D3D12_COLOR_WRITE_ENABLE_ALL; + descPso.BlendState.RenderTarget[1].RenderTargetWriteMask = 0; + descPso.BlendState.RenderTarget[2].RenderTargetWriteMask = 0; + descPso.BlendState.RenderTarget[3].RenderTargetWriteMask = 0; + + descPso.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC(D3D12_DEFAULT); + descPso.DepthStencilState.DepthEnable = TRUE; + descPso.DepthStencilState.DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ZERO; + descPso.DepthStencilState.DepthFunc = D3D12_COMPARISON_FUNC_GREATER_EQUAL; + descPso.SampleMask = UINT_MAX; + descPso.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + descPso.NumRenderTargets = 4; + descPso.RTVFormats[0] = DXGI_FORMAT_R16G16B16A16_FLOAT; + descPso.RTVFormats[1] = DXGI_FORMAT_R16G16_FLOAT; + descPso.RTVFormats[2] = DXGI_FORMAT_R8_UNORM; + descPso.RTVFormats[3] = DXGI_FORMAT_R8_UNORM; + descPso.DSVFormat = DXGI_FORMAT_D32_FLOAT; + descPso.SampleDesc.Count = 1; + descPso.NodeMask = 0; + + for ( int i = 0; i < NumStreakModes; i++ ) + { + for ( int j = 0; j < NumReactiveModes; j++ ) + { + descPso.BlendState.RenderTarget[2].RenderTargetWriteMask = 0; + + DefineList defines; + defines["API_DX12"] = ""; + if ( i == StreaksOn ) + defines["STREAKS"] = ""; + + if ( j == ReactiveOn ) + { + defines["REACTIVE"] = ""; + descPso.BlendState.RenderTarget[2].RenderTargetWriteMask = D3D12_COLOR_WRITE_ENABLE_RED; + } + + D3D12_SHADER_BYTECODE vertexShader = {}; + CompileShaderFromFile( "ParticleRender.hlsl", &defines, "VS_StructuredBuffer", "-T vs_6_0", &vertexShader ); + + D3D12_SHADER_BYTECODE pixelShader = {}; + CompileShaderFromFile( "ParticleRender.hlsl", &defines, "PS_Billboard", "-T ps_6_0", &pixelShader ); + + descPso.VS = vertexShader; + descPso.PS = pixelShader; + m_pDevice->GetDevice()->CreateGraphicsPipelineState( &descPso, IID_PPV_ARGS( &m_pRasterizationPipelines[ i ][ j ] ) ); + } + } + + D3D12_INDIRECT_ARGUMENT_DESC argumentDescs[2] = {}; + argumentDescs[0].Type = D3D12_INDIRECT_ARGUMENT_TYPE_UNORDERED_ACCESS_VIEW; + argumentDescs[0].UnorderedAccessView.RootParameterIndex = 2; + argumentDescs[1].Type = D3D12_INDIRECT_ARGUMENT_TYPE_DRAW_INDEXED; + + D3D12_COMMAND_SIGNATURE_DESC commandSignatureDesc = {}; + commandSignatureDesc.pArgumentDescs = argumentDescs; + commandSignatureDesc.NumArgumentDescs = _countof( argumentDescs ); + commandSignatureDesc.ByteStride = sizeof( IndirectCommand ); + + m_pDevice->GetDevice()->CreateCommandSignature( &commandSignatureDesc, m_pRasterizationRootSignature, IID_PPV_ARGS( &m_commandSignature ) ); + m_commandSignature->SetName( L"CommandSignature" ); +} + + +void GPUParticleSystem::OnResizedSwapChain( int width, int height, Texture& depthBuffer ) +{ + m_ScreenWidth = width; + m_ScreenHeight = height; + m_InvScreenWidth = 1.0f / m_ScreenWidth; + m_InvScreenHeight = 1.0f / m_ScreenHeight; + + depthBuffer.CreateSRV( 0, 
&m_SimulationSRVDescriptorTable ); + depthBuffer.CreateSRV( 5, &m_RasterizationSRVDescriptorTable ); +} + + +void GPUParticleSystem::OnReleasingSwapChain() +{ +} + + +void GPUParticleSystem::OnDestroyDevice() +{ + m_pDevice = nullptr; + + m_ParticleBufferA.OnDestroy(); + m_ParticleBufferB.OnDestroy(); + m_PackedViewSpaceParticlePositions.OnDestroy(); + m_MaxRadiusBuffer.OnDestroy(); + m_DeadListBuffer.OnDestroy(); + m_AliveIndexBuffer.OnDestroy(); + m_AliveDistanceBuffer.OnDestroy(); + m_AliveCountBuffer.OnDestroy(); + m_RandomTexture.OnDestroy(); + m_Atlas.OnDestroy(); + m_IndirectArgsBuffer.OnDestroy(); + + m_pSimulatePipeline->Release(); + m_pSimulatePipeline = nullptr; + + m_pResetParticlesPipeline->Release(); + m_pResetParticlesPipeline = nullptr; + + m_pEmitPipeline->Release(); + m_pEmitPipeline = nullptr; + + m_pSimulationRootSignature->Release(); + m_pSimulationRootSignature = nullptr; + + for ( int i = 0; i < NumStreakModes; i++ ) + { + for ( int j = 0; j < NumReactiveModes; j++ ) + { + m_pRasterizationPipelines[ i ][ j ]->Release(); + m_pRasterizationPipelines[ i ][ j ] = nullptr; + } + } + + m_pRasterizationRootSignature->Release(); + m_pRasterizationRootSignature = nullptr; + + m_commandSignature->Release(); + m_commandSignature = nullptr; + + m_SortLib.OnDestroy(); + + m_ResetSystem = true; +} + + +// Per-frame emission of particles into the GPU simulation +void GPUParticleSystem::Emit( ID3D12GraphicsCommandList* pCommandList, DynamicBufferRing& constantBufferRing, int numEmitters, const EmitterParams* emitters ) +{ + pCommandList->SetPipelineState( m_pEmitPipeline ); + + // Run CS for each emitter + for ( int i = 0; i < numEmitters; i++ ) + { + const EmitterParams& emitter = emitters[ i ]; + + if ( emitter.m_NumToEmit > 0 ) + { + EmitterConstantBuffer* constants = nullptr; + D3D12_GPU_VIRTUAL_ADDRESS constantBuffer; + constantBufferRing.AllocConstantBuffer( sizeof(*constants), (void**)&constants, &constantBuffer ); + constants->m_EmitterPosition = emitter.m_Position; + constants->m_EmitterVelocity = emitter.m_Velocity; + constants->m_MaxParticlesThisFrame = emitter.m_NumToEmit; + constants->m_ParticleLifeSpan = emitter.m_ParticleLifeSpan; + constants->m_StartSize = emitter.m_StartSize; + constants->m_EndSize = emitter.m_EndSize; + constants->m_PositionVariance = emitter.m_PositionVariance; + constants->m_VelocityVariance = emitter.m_VelocityVariance; + constants->m_Mass = emitter.m_Mass; + constants->m_Index = i; + constants->m_Streaks = emitter.m_Streaks ? 
1 : 0; + constants->m_TextureIndex = emitter.m_TextureIndex; + pCommandList->SetComputeRootConstantBufferView( 3, constantBuffer ); + + // Dispatch enough thread groups to spawn the requested particles + int numThreadGroups = align( emitter.m_NumToEmit, 1024 ) / 1024; + pCommandList->Dispatch( numThreadGroups, 1, 1 ); + + pCommandList->ResourceBarrier( 1, &CD3DX12_RESOURCE_BARRIER::UAV( m_DeadListBuffer.GetResource() ) ); + } + } + + // RaW barriers + pCommandList->ResourceBarrier( 1, &CD3DX12_RESOURCE_BARRIER::UAV( m_ParticleBufferA.GetResource() ) ); + pCommandList->ResourceBarrier( 1, &CD3DX12_RESOURCE_BARRIER::UAV( m_ParticleBufferB.GetResource() ) ); +} + + +// Per-frame simulation step +void GPUParticleSystem::Simulate( ID3D12GraphicsCommandList* pCommandList ) +{ + pCommandList->SetPipelineState( m_pSimulatePipeline ); + pCommandList->Dispatch( align( g_maxParticles, 256 ) / 256, 1, 1 ); +} + + +// Populate a texture with random numbers (used for the emission of particles) +void GPUParticleSystem::FillRandomTexture( UploadHeap& uploadHeap ) +{ + IMG_INFO header = {}; + header.width = 1024; + header.height = 1024; + header.depth = 1; + header.arraySize = 1; + header.mipMapCount = 1; + header.format = DXGI_FORMAT_R32G32B32A32_FLOAT; + header.bitCount = 128; + + float* values = new float[ header.width * header.height * 4 ]; + float* ptr = values; + for ( UINT i = 0; i < header.width * header.height; i++ ) + { + ptr[ 0 ] = RandomVariance( 0.0f, 1.0f ); + ptr[ 1 ] = RandomVariance( 0.0f, 1.0f ); + ptr[ 2 ] = RandomVariance( 0.0f, 1.0f ); + ptr[ 3 ] = RandomVariance( 0.0f, 1.0f ); + ptr += 4; + } + + m_RandomTexture.InitFromData(m_pDevice, "RadomTexture", uploadHeap, header, values ); + + delete[] values; +} diff --git a/src/GpuParticles/dx12/ParallelSort.cpp b/src/GpuParticles/dx12/ParallelSort.cpp new file mode 100644 index 0000000..0f1a108 --- /dev/null +++ b/src/GpuParticles/dx12/ParallelSort.cpp @@ -0,0 +1,524 @@ +// ParallelSort.cpp +// +// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
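+
+// Overview (illustrative, assumptions noted): this file wraps FFX Parallel Sort so the particle system
+// can sort its alive list by view-space distance and blend correctly. The 32-bit per-particle distances
+// in m_AliveDistanceBuffer act as the sort keys and the particle indices in m_AliveIndexBuffer travel
+// with them as the payload, with NumKeys sized to the same 400*1024 maximum as g_maxParticles. Draw()
+// runs a least-significant-digit radix sort, consuming FFX_PARALLELSORT_SORT_BITS_PER_PASS bits of the
+// key per pass (4 bits in the bundled FFX_ParallelSort.h -- an assumption here) and ping-ponging between
+// the source buffers and the DstKey/DstPayload temp buffers. A serial sketch of what one pass computes:
+//
+//   for (uint32_t shift = 0; shift < 32; shift += BITS_PER_PASS)
+//   {
+//       uint32_t histogram[1u << BITS_PER_PASS] = {};
+//       for (each key)                 histogram[(key >> shift) & MASK]++;             // Count
+//       exclusivePrefixSum(histogram);                                                 // Reduce/Scan/ScanAdd
+//       for (each key, in input order) dst[histogram[(key >> shift) & MASK]++] = key;  // Scatter (payload follows its key)
+//       swap(src, dst);
+//   }
+//
+// On the GPU the histogram and prefix sum are split across thread groups, which is why Count,
+// CountReduce, Scan, ScanAdd and Scatter exist as separate pipelines, and why the indirect path first
+// runs FPS_SetupIndirectParameters to derive the dispatch sizes from the GPU-side alive particle count.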
+ +#define FFX_CPP +#include "ParallelSort.h" +#include "../FFX-ParallelSort/FFX_ParallelSort.h" + +static const uint32_t NumKeys = { 400*1024 }; + + +void FFXParallelSort::CompileRadixPipeline(const char* shaderFile, const DefineList* defines, const char* entryPoint, ID3D12PipelineState*& pPipeline) +{ + std::string CompileFlags("-T cs_6_0"); +#ifdef _DEBUG + CompileFlags += " -Zi -Od"; +#endif // _DEBUG + + D3D12_SHADER_BYTECODE shaderByteCode = {}; + CompileShaderFromFile(shaderFile, defines, entryPoint, CompileFlags.c_str(), &shaderByteCode); + + D3D12_COMPUTE_PIPELINE_STATE_DESC descPso = {}; + descPso.CS = shaderByteCode; + descPso.Flags = D3D12_PIPELINE_STATE_FLAG_NONE; + descPso.pRootSignature = m_pFPSRootSignature; + descPso.NodeMask = 0; + + ThrowIfFailed(m_pDevice->GetDevice()->CreateComputePipelineState(&descPso, IID_PPV_ARGS(&pPipeline))); + SetName(pPipeline, entryPoint); +} + +void FFXParallelSort::OnCreate(Device* pDevice, ResourceViewHeaps* pResourceViewHeaps, DynamicBufferRing* pConstantBufferRing, UploadHeap* pUploadHeap, Texture* elementCount, Texture* listA, Texture* listB) +{ + m_pDevice = pDevice; + m_pUploadHeap = pUploadHeap; + m_pResourceViewHeaps = pResourceViewHeaps; + m_pConstantBufferRing = pConstantBufferRing; + m_SrcKeyBuffer = listA; + m_SrcPayloadBuffer = listB; + m_MaxNumThreadgroups = 800; + + // Allocate UAVs to use for data + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_ElementCountSRV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_SrcKeyUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_SrcPayloadUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(2, &m_DstKeyUAVTable); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(2, &m_DstPayloadUAVTable); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_FPSScratchUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_FPSReducedScratchUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectKeyCountsUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectConstantBufferUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectCountScatterArgsUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectReduceScanArgsUAV); + + // The DstKey and DstPayload buffers will be used as src/dst when sorting. 
A copy of the + // source key/payload will be copied into them before hand so we can keep our original values + CD3DX12_RESOURCE_DESC ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * NumKeys, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_DstKeyTempBuffer[0].InitBuffer(m_pDevice, "DstKeyTempBuf0", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COMMON); + m_DstKeyTempBuffer[1].InitBuffer(m_pDevice, "DstKeyTempBuf1", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COMMON); + m_DstPayloadTempBuffer[0].InitBuffer(m_pDevice, "DstPayloadTempBuf0", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COMMON); + m_DstPayloadTempBuffer[1].InitBuffer(m_pDevice, "DstPayloadTempBuf1", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COMMON); + { + CD3DX12_RESOURCE_BARRIER Barriers[4] = + { + CD3DX12_RESOURCE_BARRIER::Transition(m_DstKeyTempBuffer[0].GetResource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS), + CD3DX12_RESOURCE_BARRIER::Transition(m_DstKeyTempBuffer[1].GetResource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS), + CD3DX12_RESOURCE_BARRIER::Transition(m_DstPayloadTempBuffer[0].GetResource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS), + CD3DX12_RESOURCE_BARRIER::Transition(m_DstPayloadTempBuffer[1].GetResource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS) + }; + m_pUploadHeap->GetCommandList()->ResourceBarrier(4, Barriers); + } + + // Create UAVs + listA->CreateBufferUAV(0, nullptr, &m_SrcKeyUAV); + listB->CreateBufferUAV(0, nullptr, &m_SrcPayloadUAV); + m_DstKeyTempBuffer[0].CreateBufferUAV(0, nullptr, &m_DstKeyUAVTable); + m_DstPayloadTempBuffer[0].CreateBufferUAV(0, nullptr, &m_DstPayloadUAVTable); + m_DstKeyTempBuffer[1].CreateBufferUAV(1, nullptr, &m_DstKeyUAVTable); + m_DstPayloadTempBuffer[1].CreateBufferUAV(1, nullptr, &m_DstPayloadUAVTable); + + elementCount->CreateSRV( 0, &m_ElementCountSRV, 0 ); + + // We are just going to fudge the indirect execution parameters for each resolution + ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t), D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_IndirectKeyCounts.InitBuffer(m_pDevice, "IndirectKeyCounts", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COMMON); + m_IndirectKeyCounts.CreateBufferUAV(0, nullptr, &m_IndirectKeyCountsUAV); + uint8_t* pNumKeysBuffer = m_pUploadHeap->Suballocate(sizeof(uint32_t), D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT); + memcpy(pNumKeysBuffer, &NumKeys, sizeof(uint32_t) ); + m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_IndirectKeyCounts.GetResource(), 0, m_pUploadHeap->GetResource(), pNumKeysBuffer - m_pUploadHeap->BasePtr(), sizeof(uint32_t)); + CD3DX12_RESOURCE_BARRIER Barrier = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectKeyCounts.GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + m_pUploadHeap->GetCommandList()->ResourceBarrier(1, &Barrier); + + // Allocate the scratch buffers needed for radix sort + uint32_t scratchBufferSize; + uint32_t reducedScratchBufferSize; + FFX_ParallelSort_CalculateScratchResourceSize(NumKeys, scratchBufferSize, reducedScratchBufferSize); + + ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(scratchBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_FPSScratchBuffer.InitBuffer(m_pDevice, "Scratch", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COMMON); + m_FPSScratchBuffer.CreateBufferUAV(0, nullptr, &m_FPSScratchUAV); + + ResourceDesc = 
CD3DX12_RESOURCE_DESC::Buffer(reducedScratchBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_FPSReducedScratchBuffer.InitBuffer(m_pDevice, "ReducedScratch", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COMMON); + m_FPSReducedScratchBuffer.CreateBufferUAV(0, nullptr, &m_FPSReducedScratchUAV); + + // Allocate the buffers for indirect execution of the algorithm + ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(FFX_ParallelSortCB), D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_IndirectConstantBuffer.InitBuffer(m_pDevice, "IndirectConstantBuffer", &ResourceDesc, sizeof(FFX_ParallelSortCB), D3D12_RESOURCE_STATE_COMMON); + m_IndirectConstantBuffer.CreateBufferUAV(0, nullptr, &m_IndirectConstantBufferUAV); + + ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * 3, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_IndirectCountScatterArgs.InitBuffer(m_pDevice, "IndirectCount_Scatter_DispatchArgs", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COMMON); + m_IndirectCountScatterArgs.CreateBufferUAV(0, nullptr, &m_IndirectCountScatterArgsUAV); + m_IndirectReduceScanArgs.InitBuffer(m_pDevice, "IndirectCount_Scatter_DispatchArgs", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COMMON); + m_IndirectReduceScanArgs.CreateBufferUAV(0, nullptr, &m_IndirectReduceScanArgsUAV); + + { + CD3DX12_RESOURCE_BARRIER Barriers[5] = + { + CD3DX12_RESOURCE_BARRIER::Transition(m_FPSScratchBuffer.GetResource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS), + CD3DX12_RESOURCE_BARRIER::Transition(m_FPSReducedScratchBuffer.GetResource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS), + CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectConstantBuffer.GetResource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS), + CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectCountScatterArgs.GetResource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS), + CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectReduceScanArgs.GetResource(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_UNORDERED_ACCESS) + }; + m_pUploadHeap->GetCommandList()->ResourceBarrier(5, Barriers); + } + // Create root signature for Radix sort passes + { + D3D12_DESCRIPTOR_RANGE descRange[16]; + D3D12_ROOT_PARAMETER rootParams[17]; + + // Constant buffer table (always have 1) + descRange[0] = { D3D12_DESCRIPTOR_RANGE_TYPE_CBV, 1, 0, 0, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; rootParams[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[0].Descriptor = { descRange[0].BaseShaderRegister, descRange[0].RegisterSpace }; + + // Constant buffer to setup indirect params (indirect) + descRange[1] = { D3D12_DESCRIPTOR_RANGE_TYPE_CBV, 1, 1, 0, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; rootParams[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[1].Descriptor = { descRange[1].BaseShaderRegister, descRange[1].RegisterSpace }; + + rootParams[2].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; rootParams[2].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[2].Constants = { 2, 0, 1 }; + + // SrcBuffer (sort or scan) + descRange[2] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[3].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[3].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[3].DescriptorTable = { 
1, &descRange[2] }; + + // ScrPayload (sort only) + descRange[3] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 1, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[4].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[4].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[4].DescriptorTable = { 1, &descRange[3] }; + + // Scratch (sort only) + descRange[4] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 2, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[5].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[5].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[5].DescriptorTable = { 1, &descRange[4] }; + + // Scratch (reduced) + descRange[5] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 3, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[6].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[6].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[6].DescriptorTable = { 1, &descRange[5] }; + + // DstBuffer (sort or scan) + descRange[6] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 4, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[7].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[7].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[7].DescriptorTable = { 1, &descRange[6] }; + + // DstPayload (sort only) + descRange[7] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 5, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[8].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[8].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[8].DescriptorTable = { 1, &descRange[7] }; + + // ScanSrc + descRange[8] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 6, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[9].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[9].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[9].DescriptorTable = { 1, &descRange[8] }; + + // ScanDst + descRange[9] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 7, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[10].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[10].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[10].DescriptorTable = { 1, &descRange[9] }; + + // ScanScratch + descRange[10] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 8, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[11].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[11].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[11].DescriptorTable = { 1, &descRange[10] }; + + // NumKeys (indirect) + descRange[11] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 9, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[12].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[12].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[12].DescriptorTable = { 1, &descRange[11] }; + + // CBufferUAV (indirect) + descRange[12] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 10, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[13].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[13].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[13].DescriptorTable = { 1, &descRange[12] }; + + // CountScatterArgs (indirect) + descRange[13] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 11, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[14].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[14].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[14].DescriptorTable = { 1, 
&descRange[13] }; + + // ReduceScanArgs (indirect) + descRange[14] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 12, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[15].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[15].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[15].DescriptorTable = { 1, &descRange[14] }; + + descRange[15] = { D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0, 0, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[16].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[16].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[16].DescriptorTable = { 1, &descRange[15] }; + + D3D12_ROOT_SIGNATURE_DESC rootSigDesc = {}; + rootSigDesc.NumParameters = 17; + rootSigDesc.pParameters = rootParams; + rootSigDesc.NumStaticSamplers = 0; + rootSigDesc.pStaticSamplers = nullptr; + rootSigDesc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE; + + ID3DBlob* pOutBlob, * pErrorBlob = nullptr; + ThrowIfFailed(D3D12SerializeRootSignature(&rootSigDesc, D3D_ROOT_SIGNATURE_VERSION_1, &pOutBlob, &pErrorBlob)); + ThrowIfFailed(pDevice->GetDevice()->CreateRootSignature(0, pOutBlob->GetBufferPointer(), pOutBlob->GetBufferSize(), IID_PPV_ARGS(&m_pFPSRootSignature))); + SetName(m_pFPSRootSignature, "FPS_Signature"); + + pOutBlob->Release(); + if (pErrorBlob) + pErrorBlob->Release(); + + // Also create the command signature for the indirect version + D3D12_INDIRECT_ARGUMENT_DESC dispatch = {}; + dispatch.Type = D3D12_INDIRECT_ARGUMENT_TYPE_DISPATCH; + D3D12_COMMAND_SIGNATURE_DESC desc = {}; + desc.ByteStride = sizeof(D3D12_DISPATCH_ARGUMENTS); + desc.NodeMask = 0; + desc.NumArgumentDescs = 1; + desc.pArgumentDescs = &dispatch; + + ThrowIfFailed(pDevice->GetDevice()->CreateCommandSignature(&desc, nullptr, IID_PPV_ARGS(&m_pFPSCommandSignature))); + m_pFPSCommandSignature->SetName(L"FPS_CommandSignature"); + } + + ////////////////////////////////////////////////////////////////////////// + // Create pipelines for radix sort + { + // Create all of the necessary pipelines for Sort and Scan + DefineList defines; + + // SetupIndirectParams (indirect only) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_SetupIndirectParameters", m_pFPSIndirectSetupParametersPipeline); + + // Radix count (sum table generation) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Count", m_pFPSCountPipeline); + // Radix count reduce (sum table reduction for offset prescan) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_CountReduce", m_pFPSCountReducePipeline); + // Radix scan (prefix scan) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scan", m_pFPSScanPipeline); + // Radix scan add (prefix scan + reduced prefix scan addition) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_ScanAdd", m_pFPSScanAddPipeline); + // Radix scatter (key redistribution) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scatter", m_pFPSScatterPipeline); + // Radix scatter with payload (key and payload redistribution) + defines["kRS_ValueCopy"] = std::to_string(1); + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scatter", m_pFPSScatterPayloadPipeline); + } +} + +void FFXParallelSort::OnDestroy() +{ + // Release radix sort indirect resources + m_IndirectKeyCounts.OnDestroy(); + m_IndirectConstantBuffer.OnDestroy(); + m_IndirectCountScatterArgs.OnDestroy(); + m_IndirectReduceScanArgs.OnDestroy(); + m_pFPSCommandSignature->Release(); + m_pFPSIndirectSetupParametersPipeline->Release(); + + // Release radix sort algorithm 
resources + m_FPSScratchBuffer.OnDestroy(); + m_FPSReducedScratchBuffer.OnDestroy(); + m_pFPSRootSignature->Release(); + m_pFPSCountPipeline->Release(); + m_pFPSCountReducePipeline->Release(); + m_pFPSScanPipeline->Release(); + m_pFPSScanAddPipeline->Release(); + m_pFPSScatterPipeline->Release(); + m_pFPSScatterPayloadPipeline->Release(); + + // Release all of our resources + m_DstKeyTempBuffer[0].OnDestroy(); + m_DstKeyTempBuffer[1].OnDestroy(); + m_DstPayloadTempBuffer[0].OnDestroy(); + m_DstPayloadTempBuffer[1].OnDestroy(); +} + + +void FFXParallelSort::Draw(ID3D12GraphicsCommandList* pCommandList) +{ + bool bIndirectDispatch = true; + + std::string markerText = "FFXParallelSort"; + if (bIndirectDispatch) markerText += " Indirect"; + UserMarker marker(pCommandList, markerText.c_str()); + + FFX_ParallelSortCB constantBufferData = { 0 }; + + // Bind the descriptor heaps + ID3D12DescriptorHeap* pDescriptorHeap = m_pResourceViewHeaps->GetCBV_SRV_UAVHeap(); + pCommandList->SetDescriptorHeaps(1, &pDescriptorHeap); + + // Bind the root signature + pCommandList->SetComputeRootSignature(m_pFPSRootSignature); + + // Fill in the constant buffer data structure (this will be done by a shader in the indirect version) + uint32_t NumThreadgroupsToRun; + uint32_t NumReducedThreadgroupsToRun; + if (!bIndirectDispatch) + { + uint32_t NumberOfKeys = NumKeys; + FFX_ParallelSort_SetConstantAndDispatchData(NumberOfKeys, m_MaxNumThreadgroups, constantBufferData, NumThreadgroupsToRun, NumReducedThreadgroupsToRun); + } + else + { + struct SetupIndirectCB + { + uint32_t MaxThreadGroups; + }; + SetupIndirectCB IndirectSetupCB; + IndirectSetupCB.MaxThreadGroups = m_MaxNumThreadgroups; + + // Copy the data into the constant buffer + D3D12_GPU_VIRTUAL_ADDRESS constantBuffer = m_pConstantBufferRing->AllocConstantBuffer(sizeof(SetupIndirectCB), &IndirectSetupCB); + pCommandList->SetComputeRootConstantBufferView(1, constantBuffer); // SetupIndirect Constant buffer + + // Bind other buffer + pCommandList->SetComputeRootDescriptorTable(12, m_IndirectKeyCountsUAV.GetGPU()); // Key counts + pCommandList->SetComputeRootDescriptorTable(13, m_IndirectConstantBufferUAV.GetGPU()); // Indirect Sort Constant Buffer + pCommandList->SetComputeRootDescriptorTable(14, m_IndirectCountScatterArgsUAV.GetGPU()); // Indirect Sort Count/Scatter Args + pCommandList->SetComputeRootDescriptorTable(15, m_IndirectReduceScanArgsUAV.GetGPU()); // Indirect Sort Reduce/Scan Args + pCommandList->SetComputeRootDescriptorTable(16, m_ElementCountSRV.GetGPU()); // Indirect Sort Reduce/Scan Args + + // Dispatch + pCommandList->SetPipelineState(m_pFPSIndirectSetupParametersPipeline); + pCommandList->Dispatch(1, 1, 1); + + // When done, transition the args buffers to INDIRECT_ARGUMENT, and the constant buffer UAV to Constant buffer + CD3DX12_RESOURCE_BARRIER barriers[5]; + barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(m_IndirectCountScatterArgs.GetResource()); + barriers[1] = CD3DX12_RESOURCE_BARRIER::UAV(m_IndirectReduceScanArgs.GetResource()); + barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectConstantBuffer.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER); + barriers[3] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectCountScatterArgs.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT); + barriers[4] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectReduceScanArgs.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, 
D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT); + pCommandList->ResourceBarrier(5, barriers); + } + + // Setup resource/UAV pairs to use during sort + RdxDX12ResourceInfo KeySrcInfo = { m_SrcKeyBuffer->GetResource(), m_SrcKeyUAV.GetGPU(0) }; + RdxDX12ResourceInfo PayloadSrcInfo = { m_SrcPayloadBuffer->GetResource(), m_SrcPayloadUAV.GetGPU(0) }; + RdxDX12ResourceInfo KeyTmpInfo = { m_DstKeyTempBuffer[1].GetResource(), m_DstKeyUAVTable.GetGPU(1) }; + RdxDX12ResourceInfo PayloadTmpInfo = { m_DstPayloadTempBuffer[1].GetResource(), m_DstPayloadUAVTable.GetGPU(1) }; + RdxDX12ResourceInfo ScratchBufferInfo = { m_FPSScratchBuffer.GetResource(), m_FPSScratchUAV.GetGPU() }; + RdxDX12ResourceInfo ReducedScratchBufferInfo = { m_FPSReducedScratchBuffer.GetResource(), m_FPSReducedScratchUAV.GetGPU() }; + + // Buffers to ping-pong between when writing out sorted values + const RdxDX12ResourceInfo* ReadBufferInfo(&KeySrcInfo), * WriteBufferInfo(&KeyTmpInfo); + const RdxDX12ResourceInfo* ReadPayloadBufferInfo(&PayloadSrcInfo), * WritePayloadBufferInfo(&PayloadTmpInfo); + bool bHasPayload = true; + + // Setup barriers for the run + CD3DX12_RESOURCE_BARRIER barriers[3]; + + // Perform Radix Sort (currently only support 32-bit key/payload sorting + for (uint32_t Shift = 0; Shift < 32u; Shift += FFX_PARALLELSORT_SORT_BITS_PER_PASS) + { + // Update the bit shift + pCommandList->SetComputeRoot32BitConstant(2, Shift, 0); + + // Copy the data into the constant buffer + D3D12_GPU_VIRTUAL_ADDRESS constantBuffer; + if (bIndirectDispatch) + constantBuffer = m_IndirectConstantBuffer.GetResource()->GetGPUVirtualAddress(); + else + constantBuffer = m_pConstantBufferRing->AllocConstantBuffer(sizeof(FFX_ParallelSortCB), &constantBufferData); + + // Bind to root signature + pCommandList->SetComputeRootConstantBufferView(0, constantBuffer); // Constant buffer + pCommandList->SetComputeRootDescriptorTable(3, ReadBufferInfo->resourceGPUHandle); // SrcBuffer + pCommandList->SetComputeRootDescriptorTable(5, ScratchBufferInfo.resourceGPUHandle); // Scratch buffer + + // Sort Count + { + pCommandList->SetPipelineState(m_pFPSCountPipeline); + + if (bIndirectDispatch) + { + pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectCountScatterArgs.GetResource(), 0, nullptr, 0); + } + else + { + pCommandList->Dispatch(NumThreadgroupsToRun, 1, 1); + } + } + + // UAV barrier on the sum table + barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(ScratchBufferInfo.pResource); + pCommandList->ResourceBarrier(1, barriers); + + pCommandList->SetComputeRootDescriptorTable(6, ReducedScratchBufferInfo.resourceGPUHandle); // Scratch reduce buffer + + // Sort Reduce + { + pCommandList->SetPipelineState(m_pFPSCountReducePipeline); + + if (bIndirectDispatch) + { + pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectReduceScanArgs.GetResource(), 0, nullptr, 0); + } + else + { + pCommandList->Dispatch(NumReducedThreadgroupsToRun, 1, 1); + } + + // UAV barrier on the reduced sum table + barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(ReducedScratchBufferInfo.pResource); + pCommandList->ResourceBarrier(1, barriers); + } + + // Sort Scan + { + // First do scan prefix of reduced values + pCommandList->SetComputeRootDescriptorTable(9, ReducedScratchBufferInfo.resourceGPUHandle); + pCommandList->SetComputeRootDescriptorTable(10, ReducedScratchBufferInfo.resourceGPUHandle); + + pCommandList->SetPipelineState(m_pFPSScanPipeline); + if (!bIndirectDispatch) + { + assert(NumReducedThreadgroupsToRun < FFX_PARALLELSORT_ELEMENTS_PER_THREAD * 
FFX_PARALLELSORT_THREADGROUP_SIZE && "Need to account for bigger reduced histogram scan"); + } + pCommandList->Dispatch(1, 1, 1); + + // UAV barrier on the reduced sum table + barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(ReducedScratchBufferInfo.pResource); + pCommandList->ResourceBarrier(1, barriers); + + // Next do scan prefix on the histogram with partial sums that we just did + pCommandList->SetComputeRootDescriptorTable(9, ScratchBufferInfo.resourceGPUHandle); + pCommandList->SetComputeRootDescriptorTable(10, ScratchBufferInfo.resourceGPUHandle); + pCommandList->SetComputeRootDescriptorTable(11, ReducedScratchBufferInfo.resourceGPUHandle); + + pCommandList->SetPipelineState(m_pFPSScanAddPipeline); + if (bIndirectDispatch) + { + pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectReduceScanArgs.GetResource(), 0, nullptr, 0); + } + else + { + pCommandList->Dispatch(NumReducedThreadgroupsToRun, 1, 1); + } + } + + // UAV barrier on the sum table + barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(ScratchBufferInfo.pResource); + pCommandList->ResourceBarrier(1, barriers); + + if (bHasPayload) + { + pCommandList->SetComputeRootDescriptorTable(4, ReadPayloadBufferInfo->resourceGPUHandle); // ScrPayload + pCommandList->SetComputeRootDescriptorTable(8, WritePayloadBufferInfo->resourceGPUHandle); // DstPayload + } + + pCommandList->SetComputeRootDescriptorTable(7, WriteBufferInfo->resourceGPUHandle); // DstBuffer + + // Sort Scatter + { + pCommandList->SetPipelineState(bHasPayload ? m_pFPSScatterPayloadPipeline : m_pFPSScatterPipeline); + + if (bIndirectDispatch) + { + pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectCountScatterArgs.GetResource(), 0, nullptr, 0); + } + else + { + pCommandList->Dispatch(NumThreadgroupsToRun, 1, 1); + } + } + + // Finish doing everything and barrier for the next pass + int numBarriers = 0; + barriers[numBarriers++] = CD3DX12_RESOURCE_BARRIER::UAV(WriteBufferInfo->pResource); + if (bHasPayload) + barriers[numBarriers++] = CD3DX12_RESOURCE_BARRIER::UAV(WritePayloadBufferInfo->pResource); + pCommandList->ResourceBarrier(numBarriers, barriers); + + // Swap read/write sources + std::swap(ReadBufferInfo, WriteBufferInfo); + if (bHasPayload) + std::swap(ReadPayloadBufferInfo, WritePayloadBufferInfo); + } + + // When we are all done, transition indirect buffers back to UAV for the next frame (if doing indirect dispatch) + if (bIndirectDispatch) + { + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectCountScatterArgs.GetResource(), D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectReduceScanArgs.GetResource(), D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectConstantBuffer.GetResource(), D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + pCommandList->ResourceBarrier(3, barriers); + } +} \ No newline at end of file diff --git a/src/GpuParticles/dx12/ParallelSort.h b/src/GpuParticles/dx12/ParallelSort.h new file mode 100644 index 0000000..f1c1547 --- /dev/null +++ b/src/GpuParticles/dx12/ParallelSort.h @@ -0,0 +1,102 @@ +// ParallelSort.h +// +// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
+// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once +#include "../DX12/stdafx.h" + +#define SORT_BITS_PER_PASS 4 +#define SORT_BIN_COUNT (1 << SORT_BITS_PER_PASS) +#define THREADGROUP_SIZE 64 +#define ELEMENTS_PER_THREAD 4 // (256 / THREADGROUP_SIZE) +#define ITEMS_PER_WI 16 +#define INV_ITEMS_PER_WI 1/16 + +struct ParallelSortRenderCB // If you change this, also change struct ParallelSortRenderCB in ParallelSortVerify.hlsl +{ + int32_t Width; + int32_t Height; + int32_t SortWidth; + int32_t SortHeight; +}; + +// Convenience struct for passing resource/UAV pairs around +typedef struct RdxDX12ResourceInfo +{ + ID3D12Resource* pResource; ///< Pointer to the resource -- used for barriers and syncs (must NOT be nullptr) + D3D12_GPU_DESCRIPTOR_HANDLE resourceGPUHandle; ///< The GPU Descriptor Handle to use for binding the resource +} RdxDX12ResourceInfo; + +class FFXParallelSort +{ +public: + void OnCreate(Device* pDevice, ResourceViewHeaps* pResourceViewHeaps, DynamicBufferRing* pConstantBufferRing, UploadHeap* pUploadHeap, Texture* elementCount, Texture* listA, Texture* listB); + void OnDestroy(); + + void Draw(ID3D12GraphicsCommandList* pCommandList); + +private: + + void CompileRadixPipeline(const char* shaderFile, const DefineList* defines, const char* entryPoint, ID3D12PipelineState*& pPipeline); + + Device* m_pDevice = nullptr; + UploadHeap* m_pUploadHeap = nullptr; + ResourceViewHeaps* m_pResourceViewHeaps = nullptr; + DynamicBufferRing* m_pConstantBufferRing = nullptr; + uint32_t m_MaxNumThreadgroups = 320; // Use a generic thread group size when not on AMD hardware (taken from experiments to determine best performance threshold) + + // Sample resources + Texture* m_SrcKeyBuffer = nullptr; + Texture* m_SrcPayloadBuffer = nullptr; + CBV_SRV_UAV m_ElementCountSRV; + CBV_SRV_UAV m_SrcKeyUAV; // 32 bit source key UAVs + CBV_SRV_UAV m_SrcPayloadUAV; // 32 bit source payload UAVs + + Texture m_DstKeyTempBuffer[ 2 ]; + CBV_SRV_UAV m_DstKeyUAVTable; // 32 bit destination key UAVs + + Texture m_DstPayloadTempBuffer[ 2 ]; + CBV_SRV_UAV m_DstPayloadUAVTable; // 32 bit destination payload UAVs + + // Resources for parallel sort algorithm + Texture m_FPSScratchBuffer; // Sort scratch buffer + CBV_SRV_UAV m_FPSScratchUAV; // UAV needed for sort scratch buffer + Texture m_FPSReducedScratchBuffer; // Sort reduced scratch buffer + CBV_SRV_UAV m_FPSReducedScratchUAV; // UAV needed for sort reduced scratch buffer + + ID3D12RootSignature* m_pFPSRootSignature = nullptr; + 
ID3D12PipelineState* m_pFPSCountPipeline = nullptr; + ID3D12PipelineState* m_pFPSCountReducePipeline = nullptr; + ID3D12PipelineState* m_pFPSScanPipeline = nullptr; + ID3D12PipelineState* m_pFPSScanAddPipeline = nullptr; + ID3D12PipelineState* m_pFPSScatterPipeline = nullptr; + ID3D12PipelineState* m_pFPSScatterPayloadPipeline = nullptr; + + // Resources for indirect execution of algorithm + Texture m_IndirectKeyCounts; // Buffer to hold num keys for indirect dispatch + CBV_SRV_UAV m_IndirectKeyCountsUAV; // UAV needed for num keys buffer + Texture m_IndirectConstantBuffer; // Buffer to hold radix sort constant buffer data for indirect dispatch + CBV_SRV_UAV m_IndirectConstantBufferUAV; // UAV needed for indirect constant buffer + Texture m_IndirectCountScatterArgs; // Buffer to hold dispatch arguments used for Count/Scatter parts of the algorithm + CBV_SRV_UAV m_IndirectCountScatterArgsUAV; // UAV needed for count/scatter args buffer + Texture m_IndirectReduceScanArgs; // Buffer to hold dispatch arguments used for Reduce/Scan parts of the algorithm + CBV_SRV_UAV m_IndirectReduceScanArgsUAV; // UAV needed for reduce/scan args buffer + + ID3D12CommandSignature* m_pFPSCommandSignature; + ID3D12PipelineState* m_pFPSIndirectSetupParametersPipeline = nullptr; +}; \ No newline at end of file diff --git a/src/GpuParticles/vk/BufferHelper.h b/src/GpuParticles/vk/BufferHelper.h new file mode 100644 index 0000000..b81f300 --- /dev/null +++ b/src/GpuParticles/vk/BufferHelper.h @@ -0,0 +1,179 @@ +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
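//
// A minimal usage sketch for the Buffer helper declared below (illustrative
// only; it assumes a Cauldron Device backed by a VMA allocator, and pDevice,
// descriptorSet and cmdBuf are placeholder names, not symbols from this file):
//
//     CAULDRON_VK::Buffer distances;
//     distances.Init(pDevice, numElements, VK_FORMAT_R32_UINT, "Distances");   // typed buffer + buffer view
//     distances.SetDescriptorSet(0, descriptorSet, /*asUAV=*/true);            // bound as a storage texel buffer
//     distances.PipelineBarrier(cmdBuf, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
//                               VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);         // write -> read between dispatches
//     distances.OnDestroy();
//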
+// +#pragma once + +#include "../vk/stdafx.h" + + +namespace CAULDRON_VK +{ + +// For adding markers in RGP +class UserMarker +{ +public: + UserMarker(VkCommandBuffer commandBuffer, const char* name) : m_commandBuffer( commandBuffer ) { SetPerfMarkerBegin(m_commandBuffer, name); } + ~UserMarker() { SetPerfMarkerEnd(m_commandBuffer); } + +private: + VkCommandBuffer m_commandBuffer; +}; + + +size_t FormatSize(VkFormat format); + + +class Buffer +{ +public: + Buffer() {} + virtual ~Buffer() {} + virtual void OnDestroy() + { + if (m_bufferView) + { + vkDestroyBufferView(m_pDevice->GetDevice(), m_bufferView, nullptr); + m_bufferView = VK_NULL_HANDLE; + } + + if (m_buffer) + { + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_buffer, m_alloc); + m_buffer = VK_NULL_HANDLE; + } + m_pDevice = nullptr; + m_sizeInBytes = 0; + } + + bool Init(Device *pDevice, int numElements, VkFormat format, const char* name) + { + m_pDevice = pDevice; + m_sizeInBytes = numElements * FormatSize( format ); + VmaAllocationCreateInfo bufferAllocCreateInfo = {}; + bufferAllocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; + bufferAllocCreateInfo.flags = VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT; + bufferAllocCreateInfo.pUserData = (void*)name; + VmaAllocationInfo gpuAllocInfo = {}; + + VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; + bufferInfo.size = m_sizeInBytes; + bufferInfo.usage = VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT; + + VkResult res = vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferInfo, &bufferAllocCreateInfo, &m_buffer, &m_alloc, &gpuAllocInfo); + assert(res == VK_SUCCESS); + SetResourceName(pDevice->GetDevice(), VK_OBJECT_TYPE_BUFFER, (uint64_t)m_buffer, name); + + VkBufferViewCreateInfo viewInfo = {}; + viewInfo.sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO; + viewInfo.format = format; + viewInfo.buffer = m_buffer; + viewInfo.range = m_sizeInBytes; + vkCreateBufferView(pDevice->GetDevice(), &viewInfo, nullptr, &m_bufferView); + SetResourceName(m_pDevice->GetDevice(), VK_OBJECT_TYPE_BUFFER_VIEW, (uint64_t)m_bufferView, name); + return true; + } + + bool Init(Device *pDevice, int numElements, size_t structSize, const char* name, bool indirectArgs) + { + m_pDevice = pDevice; + m_sizeInBytes = numElements * structSize; + VmaAllocationCreateInfo bufferAllocCreateInfo = {}; + bufferAllocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; + bufferAllocCreateInfo.flags = VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT; + bufferAllocCreateInfo.pUserData = (void*)name; + VmaAllocationInfo gpuAllocInfo = {}; + + VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; + bufferInfo.size = m_sizeInBytes; + bufferInfo.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + if ( indirectArgs ) + bufferInfo.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT; + + VkResult res = vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferInfo, &bufferAllocCreateInfo, &m_buffer, &m_alloc, &gpuAllocInfo); + assert(res == VK_SUCCESS); + SetResourceName(pDevice->GetDevice(), VK_OBJECT_TYPE_BUFFER, (uint64_t)m_buffer, name); + + return true; + } + + VkBuffer& Resource() { return m_buffer; } + + void SetDescriptorSet(int index, VkDescriptorSet descriptorSet, bool asUAV) const + { + VkDescriptorBufferInfo descriptorBufferInfo = {}; + descriptorBufferInfo.buffer = m_buffer; + descriptorBufferInfo.range = m_sizeInBytes; + + VkWriteDescriptorSet write = {}; + write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + write.dstSet = descriptorSet; + write.descriptorCount = 
1; + if ( m_bufferView ) + { + write.descriptorType = asUAV ? VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER : VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER; + write.pTexelBufferView = &m_bufferView; + } + else + { + write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + write.pBufferInfo = &descriptorBufferInfo; + } + write.dstBinding = index; + vkUpdateDescriptorSets(m_pDevice->GetDevice(), 1, &write, 0, nullptr); + } + + void PipelineBarrier( VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask ) + { + VkBufferMemoryBarrier memoryBarrier = {}; + memoryBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + memoryBarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + memoryBarrier.dstAccessMask = dstStageMask == VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT ? VK_ACCESS_INDIRECT_COMMAND_READ_BIT : VK_ACCESS_SHADER_READ_BIT; + memoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + memoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + memoryBarrier.buffer = m_buffer; + memoryBarrier.size = m_sizeInBytes; + vkCmdPipelineBarrier( commandBuffer, srcStageMask, dstStageMask, VK_DEPENDENCY_BY_REGION_BIT, 0, nullptr, 1, &memoryBarrier, 0, nullptr ); + } + + void AddPipelineBarrier( std::vector& barrierList, VkPipelineStageFlags dstStageMask ) + { + VkBufferMemoryBarrier memoryBarrier = {}; + memoryBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + memoryBarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; + memoryBarrier.dstAccessMask = dstStageMask == VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT ? VK_ACCESS_INDIRECT_COMMAND_READ_BIT : VK_ACCESS_SHADER_READ_BIT; + memoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + memoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + memoryBarrier.buffer = m_buffer; + memoryBarrier.size = m_sizeInBytes; + barrierList.push_back( memoryBarrier ); + } + +private: + + Device* m_pDevice = nullptr; + VmaAllocation m_alloc = VK_NULL_HANDLE; + VkBuffer m_buffer = VK_NULL_HANDLE; + size_t m_sizeInBytes = 0; + VkBufferView m_bufferView = VK_NULL_HANDLE; +}; + +} \ No newline at end of file diff --git a/src/GpuParticles/vk/GPUParticleSystem.cpp b/src/GpuParticles/vk/GPUParticleSystem.cpp new file mode 100644 index 0000000..0bd26ee --- /dev/null +++ b/src/GpuParticles/vk/GPUParticleSystem.cpp @@ -0,0 +1,944 @@ +// +// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
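//
// Frame flow implemented by GPUParticleSystem::Render() further down in this
// file (a simplified outline; the real code interleaves the buffer and image
// barriers shown there):
//
//     if (m_ResetSystem)    dispatch CS_Reset          // rebuild the dead list, one thread per particle
//     Emit(...)                                        // CS_Emit once per emitter that has particles to spawn
//     Simulate(...)                                    // CS_Simulate: update particles, alive count, indirect draw args
//     if (flags & PF_Sort)  Sort(...)                  // FFXParallelSort: order alive indices by camera distance
//     vkCmdDrawIndexedIndirect(...)                    // rasterize billboards; REACTIVE variants also write a reactive mask
//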
+// +#include "../vk/stdafx.h" +#include "BufferHelper.h" +#include "../ParticleSystem.h" +#include "../ParticleHelpers.h" +#include "../ParticleSystemInternal.h" +#include "ParallelSort.h" +#include "base/ExtDebugUtils.h" + + +size_t CAULDRON_VK::FormatSize(VkFormat format) +{ + switch (format) + { + case VK_FORMAT_R8_SINT: return 1;//(BYTE) + case VK_FORMAT_R8_UINT: return 1;//(UNSIGNED_BYTE)1 + case VK_FORMAT_R16_SINT: return 2;//(SHORT)2 + case VK_FORMAT_R16_UINT: return 2;//(UNSIGNED_SHORT)2 + case VK_FORMAT_R32_SINT: return 4;//(SIGNED_INT)4 + case VK_FORMAT_R32_UINT: return 4;//(UNSIGNED_INT)4 + case VK_FORMAT_R32_SFLOAT: return 4;//(FLOAT) + + case VK_FORMAT_R8G8_SINT: return 2 * 1;//(BYTE) + case VK_FORMAT_R8G8_UINT: return 2 * 1;//(UNSIGNED_BYTE)1 + case VK_FORMAT_R16G16_SINT: return 2 * 2;//(SHORT)2 + case VK_FORMAT_R16G16_UINT: return 2 * 2; // (UNSIGNED_SHORT)2 + case VK_FORMAT_R32G32_SINT: return 2 * 4;//(SIGNED_INT)4 + case VK_FORMAT_R32G32_UINT: return 2 * 4;//(UNSIGNED_INT)4 + case VK_FORMAT_R32G32_SFLOAT: return 2 * 4;//(FLOAT) + + case VK_FORMAT_UNDEFINED: return 0;//(BYTE) (UNSIGNED_BYTE) (SHORT) (UNSIGNED_SHORT) + case VK_FORMAT_R32G32B32_SINT: return 3 * 4;//(SIGNED_INT)4 + case VK_FORMAT_R32G32B32_UINT: return 3 * 4;//(UNSIGNED_INT)4 + case VK_FORMAT_R32G32B32_SFLOAT: return 3 * 4;//(FLOAT) + + case VK_FORMAT_R8G8B8A8_SINT: return 4 * 1;//(BYTE) + case VK_FORMAT_R8G8B8A8_UINT: return 4 * 1;//(UNSIGNED_BYTE)1 + case VK_FORMAT_R16G16B16A16_SINT: return 4 * 2;//(SHORT)2 + case VK_FORMAT_R16G16B16A16_UINT: return 4 * 2;//(UNSIGNED_SHORT)2 + case VK_FORMAT_R32G32B32A32_SINT: return 4 * 4;//(SIGNED_INT)4 + case VK_FORMAT_R32G32B32A32_UINT: return 4 * 4;//(UNSIGNED_INT)4 + case VK_FORMAT_R32G32B32A32_SFLOAT: return 4 * 4;//(FLOAT) + + case VK_FORMAT_R16G16B16A16_SFLOAT: return 4 * 2; + } + + assert(0); + return 0; +} + +#pragma warning( disable : 4100 ) // disable unreference formal parameter warnings for /W4 builds + +struct IndirectCommand +{ + int args[ 5 ]; +}; + +// GPU Particle System class. 
Responsible for updating and rendering the particles +class GPUParticleSystem : public IParticleSystem +{ +public: + + GPUParticleSystem( const char* particleAtlas ); + +private: + + enum DepthCullingMode + { + DepthCullingOn, + DepthCullingOff, + NumDepthCullingModes + }; + + enum StreakMode + { + StreaksOn, + StreaksOff, + NumStreakModes + }; + + enum ReactiveMode + { + ReactiveOn, + ReactiveOff, + NumReactiveModes + }; + + virtual ~GPUParticleSystem(); + + virtual void OnCreateDevice( Device& device, UploadHeap& uploadHeap, ResourceViewHeaps& heaps, StaticBufferPool& bufferPool, DynamicBufferRing& constantBufferRing, VkRenderPass renderPass ); + virtual void OnResizedSwapChain( int width, int height, Texture& depthBuffer, VkFramebuffer frameBuffer ); + virtual void OnReleasingSwapChain(); + virtual void OnDestroyDevice(); + + virtual void Reset(); + + virtual void Render( VkCommandBuffer commandBuffer, DynamicBufferRing& constantBufferRing, int flags, const EmitterParams* pEmitters, int nNumEmitters, const ConstantData& constantData ); + + void Emit( VkCommandBuffer commandBuffer, DynamicBufferRing& constantBufferRing, uint32_t perFrameConstantOffset, int numEmitters, const EmitterParams* emitters ); + void Simulate( VkCommandBuffer commandBuffer ); + void Sort( VkCommandBuffer commandBuffer ); + + void FillRandomTexture( UploadHeap& uploadHeap ); + void CreateSimulationAssets( DynamicBufferRing& constantBufferRing ); + void CreateRasterizedRenderingAssets( DynamicBufferRing& constantBufferRing ); + + VkPipeline CreatePipeline( const char* filename, const char* entry, VkPipelineLayout layout, const DefineList* defines ); + + Device* m_pDevice = nullptr; + ResourceViewHeaps* m_heaps = nullptr; + const char* m_AtlasPath = nullptr; + VkRenderPass m_renderPass = VK_NULL_HANDLE; + VkFramebuffer m_frameBuffer = VK_NULL_HANDLE; + + Texture m_Atlas = {}; + VkImageView m_AtlasSRV = {}; + Buffer m_ParticleBufferA = {}; + Buffer m_ParticleBufferB = {}; + Buffer m_PackedViewSpaceParticlePositions = {}; + Buffer m_MaxRadiusBuffer = {}; + Buffer m_DeadListBuffer = {}; + Buffer m_AliveCountBuffer = {}; + Buffer m_AliveIndexBuffer = {}; + Buffer m_AliveDistanceBuffer = {}; + Buffer m_DstAliveIndexBuffer = {}; // working memory for the Radix sorter + Buffer m_DstAliveDistanceBuffer = {}; // working memory for the Radix sorter + Buffer m_IndirectArgsBuffer = {}; + + Texture m_RandomTexture = {}; + VkImageView m_RandomTextureSRV = {}; + + VkImage m_DepthBuffer = {}; + VkImageView m_DepthBufferSRV = {}; + + VkDescriptorSetLayout m_SimulationDescriptorSetLayout = VK_NULL_HANDLE; + VkDescriptorSet m_SimulationDescriptorSet = VK_NULL_HANDLE; + + VkDescriptorSetLayout m_RasterizationDescriptorSetLayout = VK_NULL_HANDLE; + VkDescriptorSet m_RasterizationDescriptorSet = VK_NULL_HANDLE; + + VkSampler m_samplers[ 3 ] = {}; + + UINT m_ScreenWidth = 0; + UINT m_ScreenHeight = 0; + float m_InvScreenWidth = 0.0f; + float m_InvScreenHeight = 0.0f; + float m_ElapsedTime = 0.0f; + float m_AlphaThreshold = 0.97f; + + VkDescriptorBufferInfo m_IndexBuffer = {}; + + VkPipelineLayout m_SimulationPipelineLayout = VK_NULL_HANDLE; + VkPipelineLayout m_RasterizationPipelineLayout = VK_NULL_HANDLE; + + VkPipeline m_SimulationPipeline = VK_NULL_HANDLE; + VkPipeline m_EmitPipeline = VK_NULL_HANDLE; + VkPipeline m_ResetParticlesPipeline = VK_NULL_HANDLE; + VkPipeline m_RasterizationPipelines[ NumStreakModes ][ NumReactiveModes ] = {}; + + bool m_ResetSystem = true; + FFXParallelSort m_SortLib = {}; +}; + + +IParticleSystem* 
IParticleSystem::CreateGPUSystem( const char* particleAtlas ) +{ + return new GPUParticleSystem( particleAtlas ); +} + + +GPUParticleSystem::GPUParticleSystem( const char* particleAtlas ) : m_AtlasPath( particleAtlas ) {} +GPUParticleSystem::~GPUParticleSystem() {} + + +// Use the sort lib to perform a bitonic sort over the particle indices based on their distance from camera +void GPUParticleSystem::Sort( VkCommandBuffer commandBuffer ) +{ + m_SortLib.Draw( commandBuffer ); +} + + +void GPUParticleSystem::Reset() +{ + m_ResetSystem = true; +} + + +void GPUParticleSystem::Render( VkCommandBuffer commandBuffer, DynamicBufferRing& constantBufferRing, int flags, const EmitterParams* pEmitters, int nNumEmitters, const ConstantData& constantData ) +{ + SimulationConstantBuffer simulationConstants = {}; + + memcpy( simulationConstants.m_StartColor, constantData.m_StartColor, sizeof( simulationConstants.m_StartColor ) ); + memcpy( simulationConstants.m_EndColor, constantData.m_EndColor, sizeof( simulationConstants.m_EndColor ) ); + memcpy( simulationConstants.m_EmitterLightingCenter, constantData.m_EmitterLightingCenter, sizeof( simulationConstants.m_EmitterLightingCenter ) ); + + simulationConstants.m_ViewProjection = constantData.m_ViewProjection; + simulationConstants.m_View = constantData.m_View; + simulationConstants.m_ViewInv = constantData.m_ViewInv; + simulationConstants.m_ProjectionInv = constantData.m_ProjectionInv; + + simulationConstants.m_EyePosition = constantData.m_ViewInv.getCol3(); + simulationConstants.m_SunDirection = constantData.m_SunDirection; + + simulationConstants.m_ScreenWidth = m_ScreenWidth; + simulationConstants.m_ScreenHeight = m_ScreenHeight; + simulationConstants.m_MaxParticles = g_maxParticles; + simulationConstants.m_FrameTime = constantData.m_FrameTime; + + math::Vector4 sunDirectionVS = constantData.m_View * constantData.m_SunDirection; + + m_ElapsedTime += constantData.m_FrameTime; + if ( m_ElapsedTime > 10.0f ) + m_ElapsedTime -= 10.0f; + + simulationConstants.m_ElapsedTime = m_ElapsedTime; + + void* data = nullptr; + VkDescriptorBufferInfo constantBuffer = {}; + constantBufferRing.AllocConstantBuffer( sizeof( simulationConstants ), &data, &constantBuffer ); + memcpy( data, &simulationConstants, sizeof( simulationConstants ) ); + + { + uint32_t uniformOffsets[] = { (uint32_t)constantBuffer.offset, 0 }; + vkCmdBindDescriptorSets( commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, m_SimulationPipelineLayout, 0, 1, &m_SimulationDescriptorSet, _countof( uniformOffsets ), uniformOffsets ); + + + UserMarker marker( commandBuffer, "simulation" ); + + // If we are resetting the particle system, then initialize the dead list + if ( m_ResetSystem ) + { + vkCmdBindPipeline( commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, m_ResetParticlesPipeline ); + + // Disaptch a set of 1d thread groups to fill out the dead list, one thread per particle + vkCmdDispatch( commandBuffer, align( g_maxParticles, 256 ) / 256, 1, 1 ); + + std::vector barriers = {}; + m_ParticleBufferA.AddPipelineBarrier( barriers, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); + m_ParticleBufferB.AddPipelineBarrier( barriers, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); + m_DeadListBuffer.AddPipelineBarrier( barriers, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); + vkCmdPipelineBarrier( commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_DEPENDENCY_BY_REGION_BIT, 0, nullptr, (uint32_t)barriers.size(), &barriers[ 0 ], 0, nullptr ); + + m_ResetSystem = false; + } + + // Emit particles into 
the system
+        Emit( commandBuffer, constantBufferRing, (uint32_t)constantBuffer.offset, nNumEmitters, pEmitters );
+
+        // Run the simulation for this frame
+        Simulate( commandBuffer );
+
+        std::vector<VkBufferMemoryBarrier> barriersAfterSimulation = {};
+        m_ParticleBufferA.AddPipelineBarrier( barriersAfterSimulation, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT );
+        m_PackedViewSpaceParticlePositions.AddPipelineBarrier( barriersAfterSimulation, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT );
+        m_MaxRadiusBuffer.AddPipelineBarrier( barriersAfterSimulation, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT );
+        m_DeadListBuffer.AddPipelineBarrier( barriersAfterSimulation, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT );
+        m_AliveCountBuffer.AddPipelineBarrier( barriersAfterSimulation, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT );
+
+        VkImageMemoryBarrier barrier = {};
+        barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+        barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT;
+        barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT;
+        barrier.oldLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+        barrier.newLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL;
+        barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+        barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+        barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
+        barrier.subresourceRange.baseMipLevel = 0;
+        barrier.subresourceRange.levelCount = 1;
+        barrier.subresourceRange.baseArrayLayer = 0;
+        barrier.subresourceRange.layerCount = 1;
+        barrier.image = m_DepthBuffer;
+
+        vkCmdPipelineBarrier( commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT, VK_DEPENDENCY_BY_REGION_BIT, 0, nullptr, (uint32_t)barriersAfterSimulation.size(), &barriersAfterSimulation[ 0 ], 1, &barrier );
+    }
+
+    {
+        UserMarker marker( commandBuffer, "rasterization" );
+
+        // Sort if requested.
Not doing so results in the particles rendering out of order and not blending correctly + if ( flags & PF_Sort ) + { + UserMarker marker( commandBuffer, "sorting" ); + + std::vector barriers = {}; + m_AliveIndexBuffer.AddPipelineBarrier( barriers, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); + m_AliveDistanceBuffer.AddPipelineBarrier( barriers, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); + vkCmdPipelineBarrier( commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_DEPENDENCY_BY_REGION_BIT, 0, nullptr, (uint32_t)barriers.size(), &barriers[ 0 ], 0, nullptr ); + + Sort( commandBuffer ); + } + + std::vector barriers = {}; + m_AliveIndexBuffer.AddPipelineBarrier( barriers, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT ); + m_IndirectArgsBuffer.AddPipelineBarrier( barriers, VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT ); + vkCmdPipelineBarrier( commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, VK_DEPENDENCY_BY_REGION_BIT, 0, nullptr, (uint32_t)barriers.size(), &barriers[ 0 ], 0, nullptr ); + + RenderingConstantBuffer* cb = nullptr; + VkDescriptorBufferInfo constantBuffer = {}; + constantBufferRing.AllocConstantBuffer( sizeof( RenderingConstantBuffer ), (void**)&cb, &constantBuffer ); + cb->m_Projection = constantData.m_Projection; + cb->m_ProjectionInv = simulationConstants.m_ProjectionInv; + cb->m_SunColor = constantData.m_SunColor; + cb->m_AmbientColor = constantData.m_AmbientColor; + cb->m_SunDirectionVS = sunDirectionVS; + cb->m_ScreenWidth = m_ScreenWidth; + cb->m_ScreenHeight = m_ScreenHeight; + + uint32_t uniformOffsets[1] = { (uint32_t)constantBuffer.offset }; + vkCmdBindDescriptorSets( commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_RasterizationPipelineLayout, 0, 1, &m_RasterizationDescriptorSet, 1, uniformOffsets ); + + VkRenderPassBeginInfo renderPassBegin = {}; + renderPassBegin.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + renderPassBegin.renderPass = m_renderPass; + renderPassBegin.framebuffer = m_frameBuffer; + renderPassBegin.renderArea.extent.width = m_ScreenWidth; + renderPassBegin.renderArea.extent.height = m_ScreenHeight; + + vkCmdBeginRenderPass( commandBuffer, &renderPassBegin, VK_SUBPASS_CONTENTS_INLINE ); + + StreakMode streaks = flags & PF_Streaks ? StreaksOn : StreaksOff; + ReactiveMode reactive = flags & PF_Reactive ? 
ReactiveOn : ReactiveOff; + + vkCmdBindPipeline( commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, m_RasterizationPipelines[ streaks ][ reactive ] ); + + vkCmdBindIndexBuffer( commandBuffer, m_IndexBuffer.buffer, m_IndexBuffer.offset, VK_INDEX_TYPE_UINT32 ); + + vkCmdDrawIndexedIndirect( commandBuffer, m_IndirectArgsBuffer.Resource(), 0, 1, sizeof( IndirectCommand ) ); + + vkCmdEndRenderPass( commandBuffer ); + } +} + + +void GPUParticleSystem::OnCreateDevice( Device& device, UploadHeap& uploadHeap, ResourceViewHeaps& heaps, StaticBufferPool& bufferPool, DynamicBufferRing& constantBufferRing, VkRenderPass renderPass ) +{ + m_pDevice = &device; + m_heaps = &heaps; + m_renderPass = renderPass; + + VkSamplerCreateInfo sampler = {}; + sampler.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; + sampler.minLod = 0.0f; + sampler.maxLod = FLT_MAX; + sampler.mipLodBias = 0.0f; + sampler.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE; + sampler.compareEnable = VK_FALSE; + sampler.compareOp = VK_COMPARE_OP_NEVER; + sampler.maxAnisotropy = 1.0f; + sampler.anisotropyEnable = VK_FALSE; + + for ( int i = 0; i < 3; i++ ) + { + if ( i == 1 ) + { + sampler.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; + sampler.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; + sampler.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; + } + else + { + sampler.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; + } + + if ( i == 2 ) + { + sampler.magFilter = VK_FILTER_NEAREST; + sampler.minFilter = VK_FILTER_NEAREST; + sampler.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; + } + else + { + sampler.magFilter = VK_FILTER_LINEAR; + sampler.minFilter = VK_FILTER_LINEAR; + sampler.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR; + } + + vkCreateSampler( m_pDevice->GetDevice(), &sampler, nullptr, &m_samplers[ i ] ); + } + + // Create the global particle pool. Each particle is split into two parts for better cache coherency. The first half contains the data more + // relevant to rendering while the second half is more related to simulation + m_ParticleBufferA.Init( m_pDevice, g_maxParticles, sizeof( GPUParticlePartA ), "ParticleBufferA", false ); + m_ParticleBufferB.Init( m_pDevice, g_maxParticles, sizeof( GPUParticlePartB ), "ParticleBufferB", false ); + + // The packed view space positions of particles are cached during simulation so allocate a buffer for them + m_PackedViewSpaceParticlePositions.Init( m_pDevice, g_maxParticles, sizeof( UINT ) * 2, "PackedViewSpaceParticlePositions", false ); + + // The maximum radii of each particle is cached during simulation to avoid recomputing multiple times later. This is only required + // for streaked particles as they are not round so we cache the max radius of X and Y + m_MaxRadiusBuffer.Init( m_pDevice, g_maxParticles, 4, "MaxRadiusBuffer", false ); + + // The dead particle index list. Created as an append buffer + m_DeadListBuffer.Init( m_pDevice, g_maxParticles + 1, 4, "DeadListBuffer", false ); + + // Create the buffer to hold the number of alive particles + m_AliveCountBuffer.Init( m_pDevice, 1, 4, "AliveCountBuffer", false ); + + // Create the index buffer of alive particles that is to be sorted (at least in the rasterization path). 
+ m_AliveIndexBuffer.Init( m_pDevice, g_maxParticles, 4, "AliveIndexBuffer", false ); + m_DstAliveIndexBuffer.Init( m_pDevice, g_maxParticles, 4, "DstAliveIndexBuffer", false ); + + // Create the list of distances of each alive particle - used for sorting in the rasterization path. + m_AliveDistanceBuffer.Init( m_pDevice, g_maxParticles, 4, "AliveDistanceBuffer", false ); + m_DstAliveDistanceBuffer.Init( m_pDevice, g_maxParticles, 4, "DstAliveDistanceBuffer", false ); + + // Create the buffer to store the indirect args for the ExecuteIndirect call + // Create the index buffer of alive particles that is to be sorted (at least in the rasterization path). + m_IndirectArgsBuffer.Init( m_pDevice, 1, sizeof( IndirectCommand ), "IndirectArgsBuffer", true ); + + // Create the particle billboard index buffer required for the rasterization VS-only path + UINT* indices = new UINT[ g_maxParticles * 6 ]; + UINT* ptr = indices; + UINT base = 0; + for ( int i = 0; i < g_maxParticles; i++ ) + { + ptr[ 0 ] = base + 0; + ptr[ 1 ] = base + 1; + ptr[ 2 ] = base + 2; + + ptr[ 3 ] = base + 2; + ptr[ 4 ] = base + 1; + ptr[ 5 ] = base + 3; + + base += 4; + ptr += 6; + } + + bufferPool.AllocBuffer( g_maxParticles * 6, sizeof( UINT ), indices, &m_IndexBuffer ); + delete[] indices; + + // Initialize the random numbers texture + FillRandomTexture( uploadHeap ); + + m_Atlas.InitFromFile( &device, &uploadHeap, m_AtlasPath, true ); + m_Atlas.CreateSRV( &m_AtlasSRV ); + + CreateSimulationAssets( constantBufferRing ); + CreateRasterizedRenderingAssets( constantBufferRing ); + + // Create the SortLib resources + m_SortLib.OnCreate( &device, &heaps, &constantBufferRing, &uploadHeap, &m_AliveCountBuffer, &m_AliveDistanceBuffer, &m_AliveIndexBuffer, &m_DstAliveDistanceBuffer, &m_DstAliveIndexBuffer ); +} + + +VkPipeline GPUParticleSystem::CreatePipeline( const char* filename, const char* entry, VkPipelineLayout layout, const DefineList* defines ) +{ + VkPipelineShaderStageCreateInfo computeShader = {}; + VkResult res = VKCompileFromFile( m_pDevice->GetDevice(), VK_SHADER_STAGE_COMPUTE_BIT, filename, entry, "-T cs_6_0", defines, &computeShader ); + assert(res == VK_SUCCESS); + + VkComputePipelineCreateInfo pipelineInfo = {}; + pipelineInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + pipelineInfo.layout = layout; + pipelineInfo.stage = computeShader; + + VkPipeline pipeline = {}; + res = vkCreateComputePipelines( m_pDevice->GetDevice(), m_pDevice->GetPipelineCache(), 1, &pipelineInfo, nullptr, &pipeline ); + assert(res == VK_SUCCESS); + SetResourceName( m_pDevice->GetDevice(), VK_OBJECT_TYPE_PIPELINE, (uint64_t)pipeline, entry ); + + return pipeline; +} + + +void GPUParticleSystem::CreateSimulationAssets( DynamicBufferRing& constantBufferRing ) +{ + // 0 - g_ParticleBufferA + // 1 - g_ParticleBufferB + // 2 - g_DeadList + // 3 - g_IndexBuffer + // 4 - g_DistanceBuffer + // 5 - g_MaxRadiusBuffer + // 6 - g_PackedViewSpacePositions + // 7 - g_DrawArgs + // 8 - g_AliveParticleCount + // 9 - g_DepthBuffer + // 10 - g_RandomBuffer + // 11 - PerFrameConstantBuffer + // 12 - EmitterConstantBuffer + // 13 - g_samWrapPoint + + std::vector layout_bindings( 14 ); + int binding = 0; + for ( int i = 0; i < 9; i++ ) + { + layout_bindings[binding].binding = binding; + layout_bindings[binding].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + layout_bindings[binding].descriptorCount = 1; + layout_bindings[binding].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + layout_bindings[binding].pImmutableSamplers = nullptr; + binding++; + 
} + + for ( int i = 0; i < 2; i++ ) + { + layout_bindings[binding].binding = binding; + layout_bindings[binding].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + layout_bindings[binding].descriptorCount = 1; + layout_bindings[binding].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + layout_bindings[binding].pImmutableSamplers = nullptr; + binding++; + } + for ( int i = 0; i < 2; i++ ) + { + layout_bindings[binding].binding = binding; + layout_bindings[binding].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; + layout_bindings[binding].descriptorCount = 1; + layout_bindings[binding].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + layout_bindings[binding].pImmutableSamplers = nullptr; + binding++; + } + + { + layout_bindings[binding].binding = binding; + layout_bindings[binding].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER; + layout_bindings[binding].descriptorCount = 1; + layout_bindings[binding].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + layout_bindings[binding].pImmutableSamplers = &m_samplers[ 2 ]; + binding++; + } + + assert( binding == layout_bindings.size() ); + + m_heaps->CreateDescriptorSetLayoutAndAllocDescriptorSet( &layout_bindings, &m_SimulationDescriptorSetLayout, &m_SimulationDescriptorSet ); + constantBufferRing.SetDescriptorSet( 11, sizeof( SimulationConstantBuffer ), m_SimulationDescriptorSet ); + constantBufferRing.SetDescriptorSet( 12, sizeof( EmitterConstantBuffer ), m_SimulationDescriptorSet ); + + // Create pipeline layout + // + + VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = {}; + pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutCreateInfo.setLayoutCount = 1; + pipelineLayoutCreateInfo.pSetLayouts = &m_SimulationDescriptorSetLayout; + + VkResult res = vkCreatePipelineLayout( m_pDevice->GetDevice(), &pipelineLayoutCreateInfo, nullptr, &m_SimulationPipelineLayout ); + assert(res == VK_SUCCESS); + + m_ParticleBufferA.SetDescriptorSet( 0, m_SimulationDescriptorSet, true ); + m_ParticleBufferB.SetDescriptorSet( 1, m_SimulationDescriptorSet, true ); + m_DeadListBuffer.SetDescriptorSet( 2, m_SimulationDescriptorSet, true ); + m_AliveIndexBuffer.SetDescriptorSet( 3, m_SimulationDescriptorSet, true ); + m_AliveDistanceBuffer.SetDescriptorSet( 4, m_SimulationDescriptorSet, true ); + m_MaxRadiusBuffer.SetDescriptorSet( 5, m_SimulationDescriptorSet, true ); + m_PackedViewSpaceParticlePositions.SetDescriptorSet( 6, m_SimulationDescriptorSet, true ); + m_IndirectArgsBuffer.SetDescriptorSet( 7, m_SimulationDescriptorSet, true ); + m_AliveCountBuffer.SetDescriptorSet( 8, m_SimulationDescriptorSet, true ); + // depth buffer + SetDescriptorSet( m_pDevice->GetDevice(), 10, m_RandomTextureSRV, nullptr, m_SimulationDescriptorSet ); + + // Create pipelines + // + + DefineList defines = {}; + defines[ "API_VULKAN" ] = ""; + + m_ResetParticlesPipeline = CreatePipeline( "ParticleSimulation.hlsl", "CS_Reset", m_SimulationPipelineLayout, &defines ); + m_SimulationPipeline = CreatePipeline( "ParticleSimulation.hlsl", "CS_Simulate", m_SimulationPipelineLayout, &defines ); + m_EmitPipeline = CreatePipeline( "ParticleEmit.hlsl", "CS_Emit", m_SimulationPipelineLayout, &defines ); +} + + +void GPUParticleSystem::CreateRasterizedRenderingAssets( DynamicBufferRing& constantBufferRing ) +{ + // 0 - g_ParticleBufferA + // 1 - g_PackedViewSpacePositions + // 2 - g_NumParticlesBuffer + // 3 - g_SortedIndexBuffer + // 4 - g_ParticleTexture + // 5 - g_DepthTexture + // 6 - RenderingConstantBuffer + // 7 - g_samClampLinear + + std::vector 
layout_bindings( 8 ); + for ( uint32_t i = 0; i < layout_bindings.size(); i++ ) + { + layout_bindings[i].binding = i; + layout_bindings[i].descriptorCount = 1; + layout_bindings[i].pImmutableSamplers = nullptr; + } + + layout_bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + layout_bindings[0].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + + layout_bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + layout_bindings[1].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + + layout_bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + layout_bindings[2].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + + layout_bindings[3].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + layout_bindings[3].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + + layout_bindings[4].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + layout_bindings[4].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; + + layout_bindings[5].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + layout_bindings[5].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; + + layout_bindings[6].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; + layout_bindings[6].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT; + + layout_bindings[7].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER; + layout_bindings[7].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; + layout_bindings[7].pImmutableSamplers = &m_samplers[ 1 ]; + + m_heaps->CreateDescriptorSetLayoutAndAllocDescriptorSet( &layout_bindings, &m_RasterizationDescriptorSetLayout, &m_RasterizationDescriptorSet ); + m_ParticleBufferA.SetDescriptorSet( 0, m_RasterizationDescriptorSet, false ); + m_PackedViewSpaceParticlePositions.SetDescriptorSet( 1, m_RasterizationDescriptorSet, false ); + m_AliveCountBuffer.SetDescriptorSet( 2, m_RasterizationDescriptorSet, false ); + m_AliveIndexBuffer.SetDescriptorSet( 3, m_RasterizationDescriptorSet, false ); + SetDescriptorSet( m_pDevice->GetDevice(), 4, m_AtlasSRV, nullptr, m_RasterizationDescriptorSet ); + // depth buffer + constantBufferRing.SetDescriptorSet( 6, sizeof( RenderingConstantBuffer ), m_RasterizationDescriptorSet ); + + // Create pipeline layout + // + + VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = {}; + pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutCreateInfo.setLayoutCount = 1; + pipelineLayoutCreateInfo.pSetLayouts = &m_RasterizationDescriptorSetLayout; + + VkResult res = vkCreatePipelineLayout( m_pDevice->GetDevice(), &pipelineLayoutCreateInfo, nullptr, &m_RasterizationPipelineLayout ); + assert(res == VK_SUCCESS); + + // input assembly state and layout + VkPipelineVertexInputStateCreateInfo vi = {}; + vi.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + + VkPipelineInputAssemblyStateCreateInfo ia; + ia.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + ia.pNext = NULL; + ia.flags = 0; + ia.primitiveRestartEnable = VK_FALSE; + ia.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + + // rasterizer state + VkPipelineRasterizationStateCreateInfo rs; + rs.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rs.pNext = NULL; + rs.flags = 0; + rs.polygonMode = VK_POLYGON_MODE_FILL; + rs.cullMode = VK_CULL_MODE_NONE; + rs.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; + rs.depthClampEnable = VK_FALSE; + rs.rasterizerDiscardEnable = VK_FALSE; + rs.depthBiasEnable = VK_FALSE; + rs.depthBiasConstantFactor = 0; + rs.depthBiasClamp = 0; + rs.depthBiasSlopeFactor = 0; + rs.lineWidth = 1.0f; + + 
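    // Blend state for the 4 render-pass attachments configured below. The
    // mapping is inferred from the write masks in this function rather than
    // stated explicitly elsewhere:
    //   att_state[0] - scene color: standard SrcAlpha / InvSrcAlpha "over" blending
    //   att_state[1] - color writes disabled
    //   att_state[2] - reactive mask: writes are enabled per pipeline only when
    //                  the REACTIVE define is set (see the pipeline loop below)
    //   att_state[3] - color writes disabled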
VkPipelineColorBlendAttachmentState att_state[4] = {}; + att_state[0].colorWriteMask = 0xf; + att_state[0].blendEnable = VK_TRUE; + att_state[0].alphaBlendOp = VK_BLEND_OP_ADD; + att_state[0].colorBlendOp = VK_BLEND_OP_ADD; + att_state[0].srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA; + att_state[0].dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA; + att_state[0].srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA; + att_state[0].dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO; + att_state[1].colorWriteMask = 0x0; + att_state[2].colorWriteMask = 0xf; + att_state[3].colorWriteMask = 0x0; + + // Color blend state + VkPipelineColorBlendStateCreateInfo cb = {}; + cb.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + cb.attachmentCount = _countof(att_state); + cb.pAttachments = att_state; + cb.logicOpEnable = VK_FALSE; + cb.logicOp = VK_LOGIC_OP_NO_OP; + cb.blendConstants[0] = 1.0f; + cb.blendConstants[1] = 1.0f; + cb.blendConstants[2] = 1.0f; + cb.blendConstants[3] = 1.0f; + + VkDynamicState dynamicStateEnables[] = { VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }; + VkPipelineDynamicStateCreateInfo dynamicState = {}; + dynamicState.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + dynamicState.pNext = NULL; + dynamicState.pDynamicStates = dynamicStateEnables; + dynamicState.dynamicStateCount = _countof( dynamicStateEnables ); + + // view port state + VkPipelineViewportStateCreateInfo vp = {}; + vp.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + vp.viewportCount = 1; + vp.scissorCount = 1; + + // depth stencil state + VkPipelineDepthStencilStateCreateInfo ds = {}; + ds.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + ds.depthTestEnable = VK_TRUE; + ds.depthWriteEnable = VK_FALSE; + ds.depthCompareOp = VK_COMPARE_OP_GREATER_OR_EQUAL; + ds.depthBoundsTestEnable = VK_FALSE; + ds.stencilTestEnable = VK_FALSE; + ds.back.failOp = VK_STENCIL_OP_KEEP; + ds.back.passOp = VK_STENCIL_OP_KEEP; + ds.back.compareOp = VK_COMPARE_OP_ALWAYS; + ds.back.compareMask = 0; + ds.back.reference = 0; + ds.back.depthFailOp = VK_STENCIL_OP_KEEP; + ds.back.writeMask = 0; + ds.minDepthBounds = 0; + ds.maxDepthBounds = 0; + ds.stencilTestEnable = VK_FALSE; + ds.front = ds.back; + + // multi sample state + VkPipelineMultisampleStateCreateInfo ms = {}; + ms.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + ms.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + + for ( int i = 0; i < NumStreakModes; i++ ) + { + for ( int j = 0; j < NumReactiveModes; j++ ) + { + att_state[2].colorWriteMask = 0x0; + + DefineList defines; + if ( i == StreaksOn ) + defines[ "STREAKS" ] = ""; + + if ( j == ReactiveOn ) + { + defines["REACTIVE"] = ""; + att_state[2].colorWriteMask = 0xf; + } + + // Compile shaders + // + VkPipelineShaderStageCreateInfo vertexShader = {}; + res = VKCompileFromFile(m_pDevice->GetDevice(), VK_SHADER_STAGE_VERTEX_BIT, "ParticleRender.hlsl", "VS_StructuredBuffer", "-T vs_6_0", &defines, &vertexShader ); + assert(res == VK_SUCCESS); + + VkPipelineShaderStageCreateInfo fragmentShader; + res = VKCompileFromFile(m_pDevice->GetDevice(), VK_SHADER_STAGE_FRAGMENT_BIT, "ParticleRender.hlsl", "PS_Billboard", "-T ps_6_0", &defines, &fragmentShader ); + assert(res == VK_SUCCESS); + + VkPipelineShaderStageCreateInfo shaderStages[] = { vertexShader, fragmentShader }; + + // Create pipeline + // + VkGraphicsPipelineCreateInfo pipeline = {}; + pipeline.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipeline.layout = 
m_RasterizationPipelineLayout; + pipeline.pVertexInputState = &vi; + pipeline.pInputAssemblyState = &ia; + pipeline.pRasterizationState = &rs; + pipeline.pMultisampleState = &ms; + pipeline.pColorBlendState = &cb; + pipeline.pDynamicState = &dynamicState; + pipeline.pViewportState = &vp; + pipeline.pDepthStencilState = &ds; + pipeline.pStages = shaderStages; + pipeline.stageCount = _countof( shaderStages ); + pipeline.renderPass = m_renderPass; + + res = vkCreateGraphicsPipelines( m_pDevice->GetDevice(), m_pDevice->GetPipelineCache(), 1, &pipeline, nullptr, &m_RasterizationPipelines[ i ][ j ] ); + assert(res == VK_SUCCESS); + } + } +} + + +void GPUParticleSystem::OnResizedSwapChain( int width, int height, Texture& depthBuffer, VkFramebuffer frameBuffer ) +{ + m_frameBuffer = frameBuffer; + m_ScreenWidth = width; + m_ScreenHeight = height; + m_InvScreenWidth = 1.0f / m_ScreenWidth; + m_InvScreenHeight = 1.0f / m_ScreenHeight; + + m_DepthBuffer = depthBuffer.Resource(); + depthBuffer.CreateSRV( &m_DepthBufferSRV ); + + SetDescriptorSetForDepth( m_pDevice->GetDevice(), 9, m_DepthBufferSRV, nullptr, m_SimulationDescriptorSet ); + SetDescriptorSetForDepth( m_pDevice->GetDevice(), 5, m_DepthBufferSRV, nullptr, m_RasterizationDescriptorSet ); +} + + +void GPUParticleSystem::OnReleasingSwapChain() +{ + if (m_DepthBufferSRV != nullptr) + { + vkDestroyImageView(m_pDevice->GetDevice(), m_DepthBufferSRV, nullptr); + m_DepthBufferSRV = {}; + } +} + + +void GPUParticleSystem::OnDestroyDevice() +{ + m_ParticleBufferA.OnDestroy(); + m_ParticleBufferB.OnDestroy(); + m_PackedViewSpaceParticlePositions.OnDestroy(); + m_MaxRadiusBuffer.OnDestroy(); + m_DeadListBuffer.OnDestroy(); + m_AliveDistanceBuffer.OnDestroy(); + m_AliveIndexBuffer.OnDestroy(); + m_DstAliveDistanceBuffer.OnDestroy(); + m_DstAliveIndexBuffer.OnDestroy(); + m_AliveCountBuffer.OnDestroy(); + vkDestroyImageView( m_pDevice->GetDevice(), m_RandomTextureSRV, nullptr ); + m_RandomTexture.OnDestroy(); + vkDestroyImageView( m_pDevice->GetDevice(), m_AtlasSRV, nullptr ); + m_Atlas.OnDestroy(); + m_IndirectArgsBuffer.OnDestroy(); + + vkDestroyDescriptorSetLayout( m_pDevice->GetDevice(), m_SimulationDescriptorSetLayout, nullptr ); + vkDestroyDescriptorSetLayout( m_pDevice->GetDevice(), m_RasterizationDescriptorSetLayout, nullptr ); + + vkDestroyPipeline( m_pDevice->GetDevice(), m_SimulationPipeline, nullptr ); + vkDestroyPipeline( m_pDevice->GetDevice(), m_ResetParticlesPipeline, nullptr ); + vkDestroyPipeline( m_pDevice->GetDevice(), m_EmitPipeline, nullptr ); + + for ( int i = 0; i < NumStreakModes; i++ ) + { + for ( int j = 0; j < NumReactiveModes; j++ ) + { + vkDestroyPipeline( m_pDevice->GetDevice(), m_RasterizationPipelines[ i ][ j ], nullptr ); + } + } + + vkDestroyPipelineLayout( m_pDevice->GetDevice(), m_SimulationPipelineLayout, nullptr ); + vkDestroyPipelineLayout( m_pDevice->GetDevice(), m_RasterizationPipelineLayout, nullptr ); + + m_SortLib.OnDestroy(); + + for ( int i = 0; i < _countof( m_samplers ); i++ ) + { + vkDestroySampler( m_pDevice->GetDevice(), m_samplers[ i ], nullptr ); + } + + m_ResetSystem = true; + m_pDevice = nullptr; +} + + +// Per-frame emission of particles into the GPU simulation +void GPUParticleSystem::Emit( VkCommandBuffer commandBuffer, DynamicBufferRing& constantBufferRing, uint32_t perFrameConstantOffset, int numEmitters, const EmitterParams* emitters ) +{ + vkCmdBindPipeline( commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, m_EmitPipeline ); + + // Run CS for each emitter + for ( int i = 0; i < numEmitters; i++ 
) + { + const EmitterParams& emitter = emitters[ i ]; + + if ( emitter.m_NumToEmit > 0 ) + { + EmitterConstantBuffer* constants = nullptr; + VkDescriptorBufferInfo constantBuffer = {}; + constantBufferRing.AllocConstantBuffer( sizeof(*constants), (void**)&constants, &constantBuffer ); + constants->m_EmitterPosition = emitter.m_Position; + constants->m_EmitterVelocity = emitter.m_Velocity; + constants->m_MaxParticlesThisFrame = emitter.m_NumToEmit; + constants->m_ParticleLifeSpan = emitter.m_ParticleLifeSpan; + constants->m_StartSize = emitter.m_StartSize; + constants->m_EndSize = emitter.m_EndSize; + constants->m_PositionVariance = emitter.m_PositionVariance; + constants->m_VelocityVariance = emitter.m_VelocityVariance; + constants->m_Mass = emitter.m_Mass; + constants->m_Index = i; + constants->m_Streaks = emitter.m_Streaks ? 1 : 0; + constants->m_TextureIndex = emitter.m_TextureIndex; + + uint32_t uniformOffsets[] = { perFrameConstantOffset, (uint32_t)constantBuffer.offset }; + vkCmdBindDescriptorSets( commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, m_SimulationPipelineLayout, 0, 1, &m_SimulationDescriptorSet, _countof( uniformOffsets ), uniformOffsets ); + + // Dispatch enough thread groups to spawn the requested particles + int numThreadGroups = align( emitter.m_NumToEmit, 1024 ) / 1024; + vkCmdDispatch( commandBuffer, numThreadGroups, 1, 1 ); + } + } + + // RaW barriers + m_ParticleBufferA.PipelineBarrier( commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); + m_ParticleBufferB.PipelineBarrier( commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); + m_DeadListBuffer.PipelineBarrier( commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT ); +} + + +// Per-frame simulation step +void GPUParticleSystem::Simulate( VkCommandBuffer commandBuffer ) +{ + vkCmdBindPipeline( commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, m_SimulationPipeline ); + vkCmdDispatch( commandBuffer, align( g_maxParticles, 256 ) / 256, 1, 1 ); +} + +// Populate a texture with random numbers (used for the emission of particles) +void GPUParticleSystem::FillRandomTexture( UploadHeap& uploadHeap ) +{ + IMG_INFO header = {}; + header.width = 1024; + header.height = 1024; + header.depth = 1; + header.arraySize = 1; + header.mipMapCount = 1; + header.format = DXGI_FORMAT_R32G32B32A32_FLOAT; + header.bitCount = 128; + + float* values = new float[ header.width * header.height * 4 ]; + float* ptr = values; + for ( UINT i = 0; i < header.width * header.height; i++ ) + { + ptr[ 0 ] = RandomVariance( 0.0f, 1.0f ); + ptr[ 1 ] = RandomVariance( 0.0f, 1.0f ); + ptr[ 2 ] = RandomVariance( 0.0f, 1.0f ); + ptr[ 3 ] = RandomVariance( 0.0f, 1.0f ); + ptr += 4; + } + + m_RandomTexture.InitFromData( m_pDevice, uploadHeap, header, values, "RandomTexture" ); + m_RandomTexture.CreateSRV( &m_RandomTextureSRV ); + + delete[] values; +} diff --git a/src/GpuParticles/vk/ParallelSort.cpp b/src/GpuParticles/vk/ParallelSort.cpp new file mode 100644 index 0000000..9f6be1f --- /dev/null +++ b/src/GpuParticles/vk/ParallelSort.cpp @@ -0,0 +1,559 @@ +// ParallelSort.cpp +// +// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
+// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#define FFX_CPP +#include "ParallelSort.h" +#include "../../FFX-ParallelSort/FFX_ParallelSort.h" + +static const uint32_t NumKeys = { 400*1024 }; + +////////////////////////////////////////////////////////////////////////// +// Helper for Vulkan +VkBufferMemoryBarrier BufferTransition(VkBuffer buffer, VkAccessFlags before, VkAccessFlags after, uint32_t size) +{ + VkBufferMemoryBarrier bufferBarrier = {}; + bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + bufferBarrier.srcAccessMask = before; + bufferBarrier.dstAccessMask = after; + bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + bufferBarrier.buffer = buffer; + bufferBarrier.size = size; + + return bufferBarrier; +} + + +void FFXParallelSort::BindConstantBuffer(VkDescriptorBufferInfo& GPUCB, VkDescriptorSet& DescriptorSet, uint32_t Binding/*=0*/, uint32_t Count/*=1*/) +{ + VkWriteDescriptorSet write_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; + write_set.pNext = nullptr; + write_set.dstSet = DescriptorSet; + write_set.dstBinding = Binding; + write_set.dstArrayElement = 0; + write_set.descriptorCount = Count; + write_set.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + write_set.pImageInfo = nullptr; + write_set.pBufferInfo = &GPUCB; + write_set.pTexelBufferView = nullptr; + vkUpdateDescriptorSets(m_pDevice->GetDevice(), 1, &write_set, 0, nullptr); +} + +void FFXParallelSort::BindUAVBuffer(VkBuffer* pBuffer, VkDescriptorSet& DescriptorSet, uint32_t Binding/*=0*/, uint32_t Count/*=1*/) +{ + std::vector bufferInfos; + for (uint32_t i = 0; i < Count; i++) + { + VkDescriptorBufferInfo bufferInfo; + bufferInfo.buffer = pBuffer[i]; + bufferInfo.offset = 0; + bufferInfo.range = VK_WHOLE_SIZE; + bufferInfos.push_back(bufferInfo); + } + + VkWriteDescriptorSet write_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; + write_set.pNext = nullptr; + write_set.dstSet = DescriptorSet; + write_set.dstBinding = Binding; + write_set.dstArrayElement = 0; + write_set.descriptorCount = Count; + write_set.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + write_set.pImageInfo = nullptr; + write_set.pBufferInfo = bufferInfos.data(); + write_set.pTexelBufferView = nullptr; + + vkUpdateDescriptorSets(m_pDevice->GetDevice(), 1, &write_set, 0, nullptr); +} + + +void FFXParallelSort::CompileRadixPipeline(const char* shaderFile, const DefineList* defines, const char* entryPoint, VkPipeline& pPipeline) 
+{ + std::string CompileFlags("-T cs_6_0"); +#ifdef _DEBUG + CompileFlags += " -Zi -Od"; +#endif // _DEBUG + + VkPipelineShaderStageCreateInfo stage_create_info = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; + + VkResult vkResult = VKCompileFromFile(m_pDevice->GetDevice(), VK_SHADER_STAGE_COMPUTE_BIT, shaderFile, entryPoint, "-T cs_6_0", defines, &stage_create_info); + stage_create_info.flags = 0; + assert(vkResult == VK_SUCCESS); + + VkComputePipelineCreateInfo create_info = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; + create_info.pNext = nullptr; + create_info.basePipelineHandle = VK_NULL_HANDLE; + create_info.basePipelineIndex = 0; + create_info.flags = 0; + create_info.layout = m_SortPipelineLayout; + create_info.stage = stage_create_info; + vkResult = vkCreateComputePipelines(m_pDevice->GetDevice(), VK_NULL_HANDLE, 1, &create_info, nullptr, &pPipeline); + assert(vkResult == VK_SUCCESS); +} + +void FFXParallelSort::OnCreate(Device* pDevice, ResourceViewHeaps* pResourceViewHeaps, DynamicBufferRing* pConstantBufferRing, UploadHeap* pUploadHeap, Buffer* elementCount, Buffer* listA, Buffer* listB, Buffer* listA2, Buffer* listB2) +{ + m_pDevice = pDevice; + m_pUploadHeap = pUploadHeap; + m_pResourceViewHeaps = pResourceViewHeaps; + m_pConstantBufferRing = pConstantBufferRing; + m_SrcKeyBuffer = listA; + m_SrcPayloadBuffer = listB; + m_DstKeyBuffer = listA2; + m_DstPayloadBuffer = listB2; + + m_MaxNumThreadgroups = 800; + + VkBufferCreateInfo bufferCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; + bufferCreateInfo.pNext = nullptr; + bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + bufferCreateInfo.usage = VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; // | VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + + VmaAllocationCreateInfo allocCreateInfo = {}; + allocCreateInfo.memoryTypeBits = 0; + allocCreateInfo.pool = VK_NULL_HANDLE; + allocCreateInfo.preferredFlags = 0; + allocCreateInfo.requiredFlags = 0; + allocCreateInfo.usage = VMA_MEMORY_USAGE_UNKNOWN; + + // Allocate the scratch buffers needed for radix sort + FFX_ParallelSort_CalculateScratchResourceSize(NumKeys, m_ScratchBufferSize, m_ReducedScratchBufferSize); + + bufferCreateInfo.size = m_ScratchBufferSize; + allocCreateInfo.pUserData = "Scratch"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_FPSScratchBuffer, &m_FPSScratchBufferAllocation, nullptr)) + { + Trace("Failed to create buffer for Scratch"); + } + + bufferCreateInfo.size = m_ReducedScratchBufferSize; + allocCreateInfo.pUserData = "ReducedScratch"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_FPSReducedScratchBuffer, &m_FPSReducedScratchBufferAllocation, nullptr)) + { + Trace("Failed to create buffer for ReducedScratch"); + } + + // Allocate the buffers for indirect execution of the algorithm + + bufferCreateInfo.size = sizeof(uint32_t) * 3; + bufferCreateInfo.usage = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + allocCreateInfo.pUserData = "IndirectCount_Scatter_DispatchArgs"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_IndirectCountScatterArgs, &m_IndirectCountScatterArgsAllocation, nullptr)) + { + Trace("Failed to create buffer for IndirectCount_Scatter_DispatchArgs"); + } + + allocCreateInfo.pUserData = 
"IndirectReduceScanArgs"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_IndirectReduceScanArgs, &m_IndirectReduceScanArgsAllocation, nullptr)) + { + Trace("Failed to create buffer for IndirectCount_Scatter_DispatchArgs"); + } + + bufferCreateInfo.size = sizeof(FFX_ParallelSortCB); + bufferCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + allocCreateInfo.pUserData = "IndirectConstantBuffer"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_IndirectConstantBuffer, &m_IndirectConstantBufferAllocation, nullptr)) + { + Trace("Failed to create buffer for IndirectConstantBuffer"); + } + + // Create Pipeline layout for Sort pass + { + // Create binding for Radix sort passes + VkDescriptorSetLayoutBinding layout_bindings_set_0[] = { + { 0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr } // Constant buffer table + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_1[] = { + { 0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr } // Constant buffer to setup indirect params (indirect) + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_InputOutputs[] = { + { 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // SrcBuffer (sort) + { 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // DstBuffer (sort) + { 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // ScrPayload (sort only) + { 3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // DstPayload (sort only) + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_Scan[] = { + { 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // ScanSrc + { 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // ScanDst + { 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // ScanScratch + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_Scratch[] = { + { 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // Scratch (sort only) + { 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // Scratch (reduced) + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_Indirect[] = { + { 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // NumKeys (indirect) + { 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // CBufferUAV (indirect) + { 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // CountScatterArgs (indirect) + { 3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr } // ReduceScanArgs (indirect) + }; + + VkDescriptorSetLayoutCreateInfo descriptor_set_layout_create_info = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; + descriptor_set_layout_create_info.pNext = nullptr; + descriptor_set_layout_create_info.flags = 0; + descriptor_set_layout_create_info.pBindings = layout_bindings_set_0; + descriptor_set_layout_create_info.bindingCount = 1; + VkResult vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutConstants); + assert(vkResult == VK_SUCCESS); + bool bDescriptorAlloc = true; + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstants, &m_SortDescriptorSetConstants[0]); + 
bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstants, &m_SortDescriptorSetConstants[1]); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstants, &m_SortDescriptorSetConstants[2]); + assert(bDescriptorAlloc == true); + + descriptor_set_layout_create_info.pBindings = layout_bindings_set_1; + descriptor_set_layout_create_info.bindingCount = 1; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutConstantsIndirect); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstantsIndirect, &m_SortDescriptorSetConstantsIndirect[0]); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstantsIndirect, &m_SortDescriptorSetConstantsIndirect[1]); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstantsIndirect, &m_SortDescriptorSetConstantsIndirect[2]); + assert(bDescriptorAlloc == true); + + descriptor_set_layout_create_info.pBindings = layout_bindings_set_InputOutputs; + descriptor_set_layout_create_info.bindingCount = 4; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutInputOutputs); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutInputOutputs, &m_SortDescriptorSetInputOutput[0]); + assert(bDescriptorAlloc == true); + bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutInputOutputs, &m_SortDescriptorSetInputOutput[1]); + assert(bDescriptorAlloc == true); + + descriptor_set_layout_create_info.pBindings = layout_bindings_set_Scan; + descriptor_set_layout_create_info.bindingCount = 3; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutScan); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutScan, &m_SortDescriptorSetScanSets[0]); + assert(bDescriptorAlloc == true); + bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutScan, &m_SortDescriptorSetScanSets[1]); + assert(bDescriptorAlloc == true); + + descriptor_set_layout_create_info.pBindings = layout_bindings_set_Scratch; + descriptor_set_layout_create_info.bindingCount = 2; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutScratch); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutScratch, &m_SortDescriptorSetScratch); + assert(bDescriptorAlloc == true); + + descriptor_set_layout_create_info.pBindings = layout_bindings_set_Indirect; + descriptor_set_layout_create_info.bindingCount = 4; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutIndirect); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutIndirect, &m_SortDescriptorSetIndirect); + assert(bDescriptorAlloc == true); + + // Create constant range representing our static constant + VkPushConstantRange constant_range; + constant_range.stageFlags = VK_SHADER_STAGE_ALL; + constant_range.offset = 0; + constant_range.size = 4; + + // Create the 
pipeline layout (Root signature) + VkPipelineLayoutCreateInfo layout_create_info = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO }; + layout_create_info.pNext = nullptr; + layout_create_info.flags = 0; + layout_create_info.setLayoutCount = 6; + VkDescriptorSetLayout layouts[] = { m_SortDescriptorSetLayoutConstants, m_SortDescriptorSetLayoutConstantsIndirect, m_SortDescriptorSetLayoutInputOutputs, + m_SortDescriptorSetLayoutScan, m_SortDescriptorSetLayoutScratch, m_SortDescriptorSetLayoutIndirect }; + layout_create_info.pSetLayouts = layouts; + layout_create_info.pushConstantRangeCount = 1; + layout_create_info.pPushConstantRanges = &constant_range; + VkResult bCreatePipelineLayout = vkCreatePipelineLayout(m_pDevice->GetDevice(), &layout_create_info, nullptr, &m_SortPipelineLayout); + assert(bCreatePipelineLayout == VK_SUCCESS); + } + + // Create Pipeline layout for Render of RadixBuffer info + { + // Create binding for Radix sort passes + VkDescriptorSetLayoutBinding layout_bindings_set_0[] = { + { 0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr } // Constant buffer table + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_1[] = { + { 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr } // Sort Buffer + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_2[] = { + { 0, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1, VK_SHADER_STAGE_ALL, nullptr } // ValidationTexture + }; + } + + ////////////////////////////////////////////////////////////////////////// + // Create pipelines for radix sort + { + // Create all of the necessary pipelines for Sort and Scan + DefineList defines; + defines[ "API_VULKAN" ] = ""; + + // SetupIndirectParams (indirect only) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_SetupIndirectParameters", m_FPSIndirectSetupParametersPipeline); + + // Radix count (sum table generation) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Count", m_FPSCountPipeline); + // Radix count reduce (sum table reduction for offset prescan) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_CountReduce", m_FPSCountReducePipeline); + // Radix scan (prefix scan) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scan", m_FPSScanPipeline); + // Radix scan add (prefix scan + reduced prefix scan addition) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_ScanAdd", m_FPSScanAddPipeline); + // Radix scatter (key redistribution) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scatter", m_FPSScatterPipeline); + // Radix scatter with payload (key and payload redistribution) + defines["kRS_ValueCopy"] = std::to_string(1); + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scatter", m_FPSScatterPayloadPipeline); + } + + // Do binding setups + { + VkBuffer BufferMaps[4]; + + // Map inputs/outputs + BufferMaps[0] = m_SrcKeyBuffer->Resource(); + BufferMaps[1] = m_DstKeyBuffer->Resource(); + BufferMaps[2] = m_SrcPayloadBuffer->Resource(); + BufferMaps[3] = m_DstPayloadBuffer->Resource(); + BindUAVBuffer(BufferMaps, m_SortDescriptorSetInputOutput[0], 0, 4); + + BufferMaps[0] = m_DstKeyBuffer->Resource(); + BufferMaps[1] = m_SrcKeyBuffer->Resource(); + BufferMaps[2] = m_DstPayloadBuffer->Resource(); + BufferMaps[3] = m_SrcPayloadBuffer->Resource(); + BindUAVBuffer(BufferMaps, m_SortDescriptorSetInputOutput[1], 0, 4); + + // Map scan sets (reduced, scratch) + BufferMaps[0] = BufferMaps[1] = m_FPSReducedScratchBuffer; + BindUAVBuffer(BufferMaps, m_SortDescriptorSetScanSets[0], 0, 2); + + 
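+        // The second scan descriptor set is consumed by the ScanAdd pass: it scans the full
+        // sum table in place (ScanSrc and ScanDst both alias the scratch buffer) while adding
+        // in the per-block offsets produced by the reduced-scratch prefix scan above.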
BufferMaps[0] = BufferMaps[1] = m_FPSScratchBuffer; + BufferMaps[2] = m_FPSReducedScratchBuffer; + BindUAVBuffer(BufferMaps, m_SortDescriptorSetScanSets[1], 0, 3); + + // Map Scratch areas (fixed) + BufferMaps[0] = m_FPSScratchBuffer; + BufferMaps[1] = m_FPSReducedScratchBuffer; + BindUAVBuffer(BufferMaps, m_SortDescriptorSetScratch, 0, 2); + + // Map indirect buffers + elementCount->SetDescriptorSet( 0, m_SortDescriptorSetIndirect, false ); + BufferMaps[0] = m_IndirectConstantBuffer; + BufferMaps[1] = m_IndirectCountScatterArgs; + BufferMaps[2] = m_IndirectReduceScanArgs; + BindUAVBuffer(BufferMaps, m_SortDescriptorSetIndirect, 1, 3); + } +} + +void FFXParallelSort::OnDestroy() +{ + // Release radix sort indirect resources + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_IndirectConstantBuffer, m_IndirectConstantBufferAllocation); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_IndirectCountScatterArgs, m_IndirectCountScatterArgsAllocation); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_IndirectReduceScanArgs, m_IndirectReduceScanArgsAllocation); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSIndirectSetupParametersPipeline, nullptr); + + // Release radix sort algorithm resources + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_FPSScratchBuffer, m_FPSScratchBufferAllocation); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_FPSReducedScratchBuffer, m_FPSReducedScratchBufferAllocation); + + vkDestroyPipelineLayout(m_pDevice->GetDevice(), m_SortPipelineLayout, nullptr); + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutConstants, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstants[0]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstants[1]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstants[2]); + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutConstantsIndirect, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstantsIndirect[0]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstantsIndirect[1]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstantsIndirect[2]); + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutInputOutputs, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetInputOutput[0]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetInputOutput[1]); + + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutScan, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetScanSets[0]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetScanSets[1]); + + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutScratch, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetScratch); + + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutIndirect, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetIndirect); + + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSCountPipeline, nullptr); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSCountReducePipeline, nullptr); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSScanPipeline, nullptr); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSScanAddPipeline, nullptr); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSScatterPipeline, nullptr); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSScatterPayloadPipeline, nullptr); +} + + +void FFXParallelSort::Draw(VkCommandBuffer commandList) +{ + // To control 
which descriptor set to use for updating data + static uint32_t frameCount = 0; + uint32_t frameConstants = (++frameCount) % 3; + + std::string markerText = "FFXParallelSortIndirect"; + SetPerfMarkerBegin(commandList, markerText.c_str()); + + // Buffers to ping-pong between when writing out sorted values + VkBuffer* ReadBufferInfo = &m_SrcKeyBuffer->Resource(); + VkBuffer* WriteBufferInfo(&m_DstKeyBuffer->Resource()); + VkBuffer* ReadPayloadBufferInfo(&m_SrcPayloadBuffer->Resource()), * WritePayloadBufferInfo(&m_DstPayloadBuffer->Resource()); + bool bHasPayload = true; + + // Setup barriers for the run + VkBufferMemoryBarrier Barriers[3]; + + // Fill in the constant buffer data structure (this will be done by a shader in the indirect version) + { + struct SetupIndirectCB + { + uint32_t MaxThreadGroups; + }; + SetupIndirectCB IndirectSetupCB; + IndirectSetupCB.MaxThreadGroups = m_MaxNumThreadgroups; + + // Copy the data into the constant buffer + VkDescriptorBufferInfo constantBuffer = m_pConstantBufferRing->AllocConstantBuffer(sizeof(SetupIndirectCB), (void*)&IndirectSetupCB); + BindConstantBuffer(constantBuffer, m_SortDescriptorSetConstantsIndirect[frameConstants]); + + // Dispatch + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 1, 1, &m_SortDescriptorSetConstantsIndirect[frameConstants], 0, nullptr); + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 5, 1, &m_SortDescriptorSetIndirect, 0, nullptr); + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_FPSIndirectSetupParametersPipeline); + vkCmdDispatch(commandList, 1, 1, 1); + + // When done, transition the args buffers to INDIRECT_ARGUMENT, and the constant buffer UAV to Constant buffer + VkBufferMemoryBarrier barriers[5]; + barriers[0] = BufferTransition(m_IndirectCountScatterArgs, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * 3); + barriers[1] = BufferTransition(m_IndirectReduceScanArgs, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * 3); + barriers[2] = BufferTransition(m_IndirectConstantBuffer, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, sizeof(FFX_ParallelSortCB)); + barriers[3] = BufferTransition(m_IndirectCountScatterArgs, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_INDIRECT_COMMAND_READ_BIT, sizeof(uint32_t) * 3); + barriers[4] = BufferTransition(m_IndirectReduceScanArgs, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_INDIRECT_COMMAND_READ_BIT, sizeof(uint32_t) * 3); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 5, barriers, 0, nullptr); + } + + // Bind the scratch descriptor sets + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 4, 1, &m_SortDescriptorSetScratch, 0, nullptr); + + // Copy the data into the constant buffer and bind + { + //constantBuffer = m_IndirectConstantBuffer.GetResource()->GetGPUVirtualAddress(); + VkDescriptorBufferInfo constantBuffer; + constantBuffer.buffer = m_IndirectConstantBuffer; + constantBuffer.offset = 0; + constantBuffer.range = VK_WHOLE_SIZE; + BindConstantBuffer(constantBuffer, m_SortDescriptorSetConstants[frameConstants]); + } + + // Bind constants + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, 
m_SortPipelineLayout, 0, 1, &m_SortDescriptorSetConstants[frameConstants], 0, nullptr); + + // Perform Radix Sort (currently only support 32-bit key/payload sorting + uint32_t inputSet = 0; + for (uint32_t Shift = 0; Shift < 32u; Shift += FFX_PARALLELSORT_SORT_BITS_PER_PASS) + { + // Update the bit shift + vkCmdPushConstants(commandList, m_SortPipelineLayout, VK_SHADER_STAGE_ALL, 0, 4, &Shift); + + // Bind input/output for this pass + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 2, 1, &m_SortDescriptorSetInputOutput[inputSet], 0, nullptr); + + // Sort Count + { + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_FPSCountPipeline); + + vkCmdDispatchIndirect(commandList, m_IndirectCountScatterArgs, 0); + } + + // UAV barrier on the sum table + Barriers[0] = BufferTransition(m_FPSScratchBuffer, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, m_ScratchBufferSize); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, Barriers, 0, nullptr); + + // Sort Reduce + { + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_FPSCountReducePipeline); + + vkCmdDispatchIndirect(commandList, m_IndirectReduceScanArgs, 0); + + // UAV barrier on the reduced sum table + Barriers[0] = BufferTransition(m_FPSReducedScratchBuffer, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, m_ReducedScratchBufferSize); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, Barriers, 0, nullptr); + } + + // Sort Scan + { + // First do scan prefix of reduced values + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 3, 1, &m_SortDescriptorSetScanSets[0], 0, nullptr); + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_FPSScanPipeline); + + vkCmdDispatch(commandList, 1, 1, 1); + + // UAV barrier on the reduced sum table + Barriers[0] = BufferTransition(m_FPSReducedScratchBuffer, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, m_ReducedScratchBufferSize); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, Barriers, 0, nullptr); + + // Next do scan prefix on the histogram with partial sums that we just did + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 3, 1, &m_SortDescriptorSetScanSets[1], 0, nullptr); + + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_FPSScanAddPipeline); + vkCmdDispatchIndirect(commandList, m_IndirectReduceScanArgs, 0); + } + + // UAV barrier on the sum table + Barriers[0] = BufferTransition(m_FPSScratchBuffer, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, m_ScratchBufferSize); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, Barriers, 0, nullptr); + + // Sort Scatter + { + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, bHasPayload ? 
m_FPSScatterPayloadPipeline : m_FPSScatterPipeline); + + vkCmdDispatchIndirect(commandList, m_IndirectCountScatterArgs, 0); + } + + // Finish doing everything and barrier for the next pass + int numBarriers = 0; + Barriers[numBarriers++] = BufferTransition(*WriteBufferInfo, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * NumKeys); + if (bHasPayload) + Barriers[numBarriers++] = BufferTransition(*WritePayloadBufferInfo, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * NumKeys); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, numBarriers, Barriers, 0, nullptr); + + // Swap read/write sources + std::swap(ReadBufferInfo, WriteBufferInfo); + if (bHasPayload) + std::swap(ReadPayloadBufferInfo, WritePayloadBufferInfo); + inputSet = !inputSet; + } + + // When we are all done, transition indirect buffers back to UAV for the next frame (if doing indirect dispatch) + { + VkBufferMemoryBarrier barriers[3]; + barriers[0] = BufferTransition(m_IndirectConstantBuffer, VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(FFX_ParallelSortCB)); + barriers[1] = BufferTransition(m_IndirectCountScatterArgs, VK_ACCESS_INDIRECT_COMMAND_READ_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * 3); + barriers[2] = BufferTransition(m_IndirectReduceScanArgs, VK_ACCESS_INDIRECT_COMMAND_READ_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * 3); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 3, barriers, 0, nullptr); + } + + // Close out the perf capture + SetPerfMarkerEnd(commandList); +} diff --git a/src/GpuParticles/vk/ParallelSort.h b/src/GpuParticles/vk/ParallelSort.h new file mode 100644 index 0000000..37c0cd0 --- /dev/null +++ b/src/GpuParticles/vk/ParallelSort.h @@ -0,0 +1,101 @@ +// ParallelSort.h +// +// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
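+
+// Vulkan wrapper around the FFX ParallelSort GPU radix sort, used by the GPU particle
+// system to sort key/payload buffer pairs on the GPU each frame.
+//
+// Typical usage (a sketch only; the names follow the declarations below):
+//     FFXParallelSort sort;
+//     sort.OnCreate(pDevice, pResourceViewHeaps, pConstantBufferRing, pUploadHeap, elementCount, listA, listB, listA2, listB2);
+//     sort.Draw(commandBuffer);   // records the indirect setup/count/reduce/scan/scatter passes for one frame
+//     sort.OnDestroy();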
+ +#pragma once +#include "../vk/stdafx.h" +#include "BufferHelper.h" + + +struct ParallelSortRenderCB // If you change this, also change struct ParallelSortRenderCB in ParallelSortVerify.hlsl +{ + int32_t Width; + int32_t Height; + int32_t SortWidth; + int32_t SortHeight; +}; + + +class FFXParallelSort +{ +public: + void OnCreate(Device* pDevice, ResourceViewHeaps* pResourceViewHeaps, DynamicBufferRing* pConstantBufferRing, UploadHeap* pUploadHeap, Buffer* elementCount, Buffer* listA, Buffer* listB, Buffer* listA2, Buffer* listB2); + void OnDestroy(); + + void Draw(VkCommandBuffer commandList); + +private: + void CompileRadixPipeline(const char* shaderFile, const DefineList* defines, const char* entryPoint, VkPipeline& pPipeline); + void BindConstantBuffer(VkDescriptorBufferInfo& GPUCB, VkDescriptorSet& DescriptorSet, uint32_t Binding = 0, uint32_t Count = 1); + void BindUAVBuffer(VkBuffer* pBuffer, VkDescriptorSet& DescriptorSet, uint32_t Binding = 0, uint32_t Count = 1); + + + Device* m_pDevice = nullptr; + UploadHeap* m_pUploadHeap = nullptr; + ResourceViewHeaps* m_pResourceViewHeaps = nullptr; + DynamicBufferRing* m_pConstantBufferRing = nullptr; + uint32_t m_MaxNumThreadgroups = 800; + + uint32_t m_ScratchBufferSize; + uint32_t m_ReducedScratchBufferSize; + + Buffer* m_SrcKeyBuffer = nullptr; + Buffer* m_SrcPayloadBuffer = nullptr; + + Buffer* m_DstKeyBuffer = nullptr; + Buffer* m_DstPayloadBuffer = nullptr; + + VkBuffer m_FPSScratchBuffer; // Sort scratch buffer + VmaAllocation m_FPSScratchBufferAllocation; + + VkBuffer m_FPSReducedScratchBuffer; // Sort reduced scratch buffer + VmaAllocation m_FPSReducedScratchBufferAllocation; + + VkDescriptorSetLayout m_SortDescriptorSetLayoutConstants; + VkDescriptorSet m_SortDescriptorSetConstants[3]; + VkDescriptorSetLayout m_SortDescriptorSetLayoutConstantsIndirect; + VkDescriptorSet m_SortDescriptorSetConstantsIndirect[3]; + + VkDescriptorSetLayout m_SortDescriptorSetLayoutInputOutputs; + VkDescriptorSetLayout m_SortDescriptorSetLayoutScan; + VkDescriptorSetLayout m_SortDescriptorSetLayoutScratch; + VkDescriptorSetLayout m_SortDescriptorSetLayoutIndirect; + + VkDescriptorSet m_SortDescriptorSetInputOutput[2]; + VkDescriptorSet m_SortDescriptorSetScanSets[2]; + VkDescriptorSet m_SortDescriptorSetScratch; + VkDescriptorSet m_SortDescriptorSetIndirect; + VkPipelineLayout m_SortPipelineLayout; + + VkPipeline m_FPSCountPipeline; + VkPipeline m_FPSCountReducePipeline; + VkPipeline m_FPSScanPipeline; + VkPipeline m_FPSScanAddPipeline; + VkPipeline m_FPSScatterPipeline; + VkPipeline m_FPSScatterPayloadPipeline; + + // Resources for indirect execution of algorithm + VkBuffer m_IndirectConstantBuffer; // Buffer to hold radix sort constant buffer data for indirect dispatch + VmaAllocation m_IndirectConstantBufferAllocation; + VkBuffer m_IndirectCountScatterArgs; // Buffer to hold dispatch arguments used for Count/Scatter parts of the algorithm + VmaAllocation m_IndirectCountScatterArgsAllocation; + VkBuffer m_IndirectReduceScanArgs; // Buffer to hold dispatch arguments used for Reduce/Scan parts of the algorithm + VmaAllocation m_IndirectReduceScanArgsAllocation; + + VkPipeline m_FPSIndirectSetupParametersPipeline; +}; \ No newline at end of file diff --git a/src/VK/AnimatedTexture.cpp b/src/VK/AnimatedTexture.cpp new file mode 100644 index 0000000..e34e68c --- /dev/null +++ b/src/VK/AnimatedTexture.cpp @@ -0,0 +1,294 @@ +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+
+#include "AnimatedTexture.h"
+
+
+struct ConstantBuffer
+{
+    math::Matrix4 currentViewProj;
+    math::Matrix4 previousViewProj;
+    float jitterCompensation[ 2 ];
+    float scrollFactor;
+    float rotationFactor;
+    int mode;
+    int pads[3];
+};
+
+
+void AnimatedTextures::OnCreate( Device& device, UploadHeap& uploadHeap, StaticBufferPool& bufferPool, VkRenderPass renderPass, ResourceViewHeaps& resourceViewHeaps, DynamicBufferRing& constantBufferRing )
+{
+    m_pDevice = &device;
+    m_constantBufferRing = &constantBufferRing;
+
+    VkSamplerCreateInfo sampler = {};
+    sampler.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
+    sampler.magFilter = VK_FILTER_LINEAR;
+    sampler.minFilter = VK_FILTER_LINEAR;
+    sampler.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR;
+    sampler.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT;
+    sampler.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT;
+    sampler.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT;
+    sampler.minLod = -1000;
+    sampler.maxLod = 1000;
+    sampler.maxAnisotropy = 16.0f;
+    VkResult res = vkCreateSampler( device.GetDevice(), &sampler, nullptr, &m_sampler);
+    assert(res == VK_SUCCESS);
+
+    // Compile shaders
+    //
+    DefineList attributeDefines;
+
+    VkPipelineShaderStageCreateInfo vertexShader;
+    res = VKCompileFromFile(m_pDevice->GetDevice(), VK_SHADER_STAGE_VERTEX_BIT, "AnimatedTexture.hlsl", "VSMain", "-T vs_6_0", &attributeDefines, &vertexShader);
+    assert(res == VK_SUCCESS);
+
+    VkPipelineShaderStageCreateInfo fragmentShader;
+    res = VKCompileFromFile(m_pDevice->GetDevice(), VK_SHADER_STAGE_FRAGMENT_BIT, "AnimatedTexture.hlsl", "PSMain", "-T ps_6_0", &attributeDefines, &fragmentShader);
+    assert(res == VK_SUCCESS);
+
+    std::vector<VkPipelineShaderStageCreateInfo> shaderStages;
+    shaderStages.push_back(vertexShader);
+    shaderStages.push_back(fragmentShader);
+
+    std::vector<VkDescriptorSetLayoutBinding> layoutBindings(3);
+    layoutBindings[0].binding = 0;
+    layoutBindings[0].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC;
+    layoutBindings[0].descriptorCount = 1;
+    layoutBindings[0].stageFlags = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT;
+    layoutBindings[0].pImmutableSamplers = nullptr;
+
+    layoutBindings[1].binding = 1;
+    layoutBindings[1].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
+    layoutBindings[1].descriptorCount = 1;
+    layoutBindings[1].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
+    layoutBindings[1].pImmutableSamplers = nullptr;
+
+    layoutBindings[2].binding = 2;
+    layoutBindings[2].descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER;
+    layoutBindings[2].descriptorCount = 1;
+    layoutBindings[2].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
+    layoutBindings[2].pImmutableSamplers = &m_sampler;
+
+    for (int i = 0; i < _countof(m_descriptorSets); i++)
+    {
+        resourceViewHeaps.CreateDescriptorSetLayoutAndAllocDescriptorSet( &layoutBindings, &m_descriptorSetLayout, &m_descriptorSets[i] );
+        constantBufferRing.SetDescriptorSet( 0, sizeof( ConstantBuffer ), m_descriptorSets[i] );
+    }
+
+    // Create pipeline layout
+    //
+    VkPipelineLayoutCreateInfo pPipelineLayoutCreateInfo = {};
+    pPipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+    pPipelineLayoutCreateInfo.setLayoutCount = 1;
+    pPipelineLayoutCreateInfo.pSetLayouts = &m_descriptorSetLayout;
+
+    res = vkCreatePipelineLayout(m_pDevice->GetDevice(), &pPipelineLayoutCreateInfo, NULL, &m_pipelineLayout);
+    assert(res == VK_SUCCESS);
+
+    VkPipelineVertexInputStateCreateInfo vi = {};
+    vi.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO;
+
+    VkPipelineInputAssemblyStateCreateInfo ia = {};
+    ia.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
+    ia.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
+
+    // rasterizer state
+    VkPipelineRasterizationStateCreateInfo rs = {};
+    rs.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
+    rs.polygonMode = VK_POLYGON_MODE_FILL;
+    rs.cullMode = VK_CULL_MODE_NONE;
+    rs.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE;
+    rs.lineWidth = 1.0f;
+
+    VkPipelineColorBlendAttachmentState att_state[4] = {};
+    att_state[0].colorWriteMask = 0xf;
+    att_state[0].blendEnable = VK_FALSE;
+    att_state[0].alphaBlendOp = VK_BLEND_OP_ADD;
+    att_state[0].colorBlendOp = VK_BLEND_OP_ADD;
+    att_state[0].srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA;
+    att_state[0].dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA;
+    att_state[0].srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE;
+    att_state[0].dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO;
+
+    att_state[1].colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT;
+    att_state[2].colorWriteMask = 0x0;
+    att_state[3].colorWriteMask = VK_COLOR_COMPONENT_R_BIT;
+
+    // Color blend state
+
+    VkPipelineColorBlendStateCreateInfo cb = {};
+    cb.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO;
+    cb.attachmentCount = _countof(att_state);
+    cb.pAttachments = att_state;
+    cb.logicOpEnable = VK_FALSE;
+    cb.logicOp = VK_LOGIC_OP_NO_OP;
+    cb.blendConstants[0] = 1.0f;
+    cb.blendConstants[1] = 1.0f;
+    cb.blendConstants[2] = 1.0f;
+    cb.blendConstants[3] = 1.0f;
+
+    std::vector<VkDynamicState> dynamicStateEnables = {
+        VK_DYNAMIC_STATE_VIEWPORT,
+        VK_DYNAMIC_STATE_SCISSOR,
+        VK_DYNAMIC_STATE_BLEND_CONSTANTS
+    };
+    VkPipelineDynamicStateCreateInfo dynamicState = {};
+    dynamicState.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO;
+    dynamicState.pNext = NULL;
+    dynamicState.pDynamicStates = dynamicStateEnables.data();
+    dynamicState.dynamicStateCount = (uint32_t)dynamicStateEnables.size();
+
+    // view port state
+
+    VkPipelineViewportStateCreateInfo vp = {};
+    vp.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO;
+    vp.viewportCount = 1;
+    vp.scissorCount = 1;
+
+    // depth stencil state
+
+    VkPipelineDepthStencilStateCreateInfo ds = {};
+    ds.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
+    ds.depthTestEnable = VK_TRUE;
+    ds.depthWriteEnable = VK_TRUE;
+    ds.depthCompareOp = VK_COMPARE_OP_GREATER_OR_EQUAL;
+    ds.back.failOp = VK_STENCIL_OP_KEEP;
+    ds.back.passOp = VK_STENCIL_OP_KEEP;
+
ds.back.compareOp = VK_COMPARE_OP_ALWAYS; + ds.back.depthFailOp = VK_STENCIL_OP_KEEP; + ds.front = ds.back; + + // multi sample state + + VkPipelineMultisampleStateCreateInfo ms = {}; + ms.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + ms.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + + // create pipeline + + VkGraphicsPipelineCreateInfo pipeline = {}; + pipeline.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipeline.layout = m_pipelineLayout; + pipeline.pVertexInputState = &vi; + pipeline.pInputAssemblyState = &ia; + pipeline.pRasterizationState = &rs; + pipeline.pColorBlendState = &cb; + pipeline.pMultisampleState = &ms; + pipeline.pDynamicState = &dynamicState; + pipeline.pViewportState = &vp; + pipeline.pDepthStencilState = &ds; + pipeline.pStages = shaderStages.data(); + pipeline.stageCount = (uint32_t)shaderStages.size(); + pipeline.renderPass = renderPass; + pipeline.subpass = 0; + + res = vkCreateGraphicsPipelines(m_pDevice->GetDevice(), device.GetPipelineCache(), 1, &pipeline, NULL, &m_pipelines[0]); + assert(res == VK_SUCCESS); + SetResourceName(m_pDevice->GetDevice(), VK_OBJECT_TYPE_PIPELINE, (uint64_t)m_pipelines[0], "AT pipeline with comp"); + + att_state[3].colorWriteMask = 0; + res = vkCreateGraphicsPipelines(m_pDevice->GetDevice(), device.GetPipelineCache(), 1, &pipeline, NULL, &m_pipelines[1]); + assert(res == VK_SUCCESS); + SetResourceName(m_pDevice->GetDevice(), VK_OBJECT_TYPE_PIPELINE, (uint64_t)m_pipelines[1], "AT pipeline no comp"); + + UINT indices[6] = { 0, 1, 2, 2, 1, 3 }; + bufferPool.AllocBuffer( _countof( indices ), sizeof( UINT ), indices, &m_indexBuffer ); + + m_textures[0].InitFromFile( &device, &uploadHeap, "..\\media\\lion.jpg", true ); + m_textures[1].InitFromFile( &device, &uploadHeap, "..\\media\\checkerboard.dds", true ); + m_textures[2].InitFromFile( &device, &uploadHeap, "..\\media\\composition_text.dds", true ); + + for ( int i = 0; i < _countof( m_textures ); i++ ) + { + m_textures[ i ].CreateSRV( &m_textureSRVs[i] ); + SetDescriptorSet( m_pDevice->GetDevice(), 1, m_textureSRVs[i], nullptr, m_descriptorSets[i] ); + } +} + + +void AnimatedTextures::OnDestroy() +{ + vkDestroySampler(m_pDevice->GetDevice(), m_sampler, nullptr); + m_sampler = VK_NULL_HANDLE; + + for ( int i = 0; i < _countof( m_textures ); i++ ) + { + vkDestroyImageView(m_pDevice->GetDevice(), m_textureSRVs[i], nullptr); + m_textureSRVs[i] = VK_NULL_HANDLE; + + m_textures[i].OnDestroy(); + } + + for ( int i = 0; i < _countof( m_pipelines ); i++ ) + { + vkDestroyPipeline( m_pDevice->GetDevice(), m_pipelines[i], nullptr ); + m_pipelines[i] = VK_NULL_HANDLE; + } + + vkDestroyPipelineLayout( m_pDevice->GetDevice(), m_pipelineLayout, nullptr ); + m_pipelineLayout = VK_NULL_HANDLE; + + vkDestroyDescriptorSetLayout( m_pDevice->GetDevice(), m_descriptorSetLayout, nullptr ); + m_descriptorSetLayout = VK_NULL_HANDLE; + + m_pDevice = nullptr; +} + + +void AnimatedTextures::Render( VkCommandBuffer commandList, float frameTime, float speed, bool compositionMask, const Camera& camera ) +{ + m_scrollFactor += frameTime * 1.0f * speed; + m_rotationFactor += frameTime * 2.0f * speed; + m_flipTimer += frameTime * 1.0f; + + if ( m_scrollFactor > 10.0f ) + m_scrollFactor -= 10.0f; + + const float twoPI = 6.283185307179586476925286766559f; + + if ( m_rotationFactor > twoPI ) + m_rotationFactor -= twoPI; + + int textureIndex = min( (int)floorf( m_flipTimer * 0.33333f ), _countof( m_textures ) - 1 ); + if ( m_flipTimer > 9.0f ) + m_flipTimer = 0.0f; + + 
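+    // Grab a per-draw slice of the dynamic constant buffer ring and fill in the current and
+    // previous view-projection matrices. The jitter compensation terms are the difference
+    // between the previous and current projection jitter offsets (the x/y entries of column 2),
+    // so the motion vectors written by the pixel shader do not contain the camera jitter.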
VkDescriptorBufferInfo cb = {}; + ConstantBuffer* constantBuffer = nullptr; + m_constantBufferRing->AllocConstantBuffer( sizeof(*constantBuffer), (void**)&constantBuffer, &cb ); + + constantBuffer->currentViewProj = camera.GetProjection() * camera.GetView(); + constantBuffer->previousViewProj = camera.GetPrevProjection() * camera.GetPrevView(); + + constantBuffer->jitterCompensation[0] = camera.GetPrevProjection().getCol2().getX() - camera.GetProjection().getCol2().getX(); + constantBuffer->jitterCompensation[1] = camera.GetPrevProjection().getCol2().getY() - camera.GetProjection().getCol2().getY(); + constantBuffer->scrollFactor = m_scrollFactor; + constantBuffer->rotationFactor = m_rotationFactor; + constantBuffer->mode = textureIndex; + + uint32_t uniformOffsets[] = { (uint32_t)cb.offset }; + vkCmdBindDescriptorSets( commandList, VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipelineLayout, 0, 1, &m_descriptorSets[textureIndex], _countof( uniformOffsets ), uniformOffsets ); + vkCmdBindPipeline( commandList, VK_PIPELINE_BIND_POINT_GRAPHICS, m_pipelines[compositionMask ? 0 : 1] ); + vkCmdBindIndexBuffer( commandList, m_indexBuffer.buffer, m_indexBuffer.offset, VK_INDEX_TYPE_UINT32 ); + vkCmdDrawIndexed( commandList, 6, 2, 0, 0, 0 ); +} diff --git a/src/VK/AnimatedTexture.h b/src/VK/AnimatedTexture.h new file mode 100644 index 0000000..a4f5e18 --- /dev/null +++ b/src/VK/AnimatedTexture.h @@ -0,0 +1,57 @@ +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
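+
+// Draws a pair of world-space quads whose texture content animates (scrolling or rotating UVs).
+// Because their surfaces change while their geometry is static, the sample uses them to show
+// how such objects should populate FSR2's reactive and transparency & composition masks.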
+ + +#pragma once +#include "stdafx.h" + + +class AnimatedTextures +{ +public: + + AnimatedTextures() {} + virtual ~AnimatedTextures() {} + + void OnCreate( Device& device, UploadHeap& uploadHeap, StaticBufferPool& bufferPool, VkRenderPass renderPass, ResourceViewHeaps& resourceViewHeaps, DynamicBufferRing& constantBufferRing ); + void OnDestroy(); + + void Render( VkCommandBuffer commandList, float frameTime, float speed, bool compositionMask, const Camera& camera ); + +private: + + Device* m_pDevice = nullptr; + DynamicBufferRing* m_constantBufferRing = nullptr; + + VkDescriptorSetLayout m_descriptorSetLayout = VK_NULL_HANDLE; + VkDescriptorSet m_descriptorSets[3] = {}; + VkPipelineLayout m_pipelineLayout = VK_NULL_HANDLE; + VkPipeline m_pipelines[2] = {}; + VkDescriptorBufferInfo m_indexBuffer = {}; + + Texture m_textures[3] = {}; + VkImageView m_textureSRVs[3] = {}; + VkSampler m_sampler = VK_NULL_HANDLE; + + float m_scrollFactor = 0.0f; + float m_rotationFactor = 0.0f; + float m_flipTimer = 0.0f; +}; \ No newline at end of file diff --git a/src/VK/AnimatedTexture.hlsl b/src/VK/AnimatedTexture.hlsl new file mode 100644 index 0000000..2dfada8 --- /dev/null +++ b/src/VK/AnimatedTexture.hlsl @@ -0,0 +1,128 @@ +// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +[[vk::binding( 0, 0 )]] cbuffer cb : register(b0) +{ + matrix g_CurrentViewProjection; + matrix g_PreviousViewProjection; + float2 g_CameraJitterCompensation; + float g_ScrollFactor; + float g_RotationFactor; + int g_Mode; + int pad0; + int pad1; + int pad2; +} + + +[[vk::binding( 1, 0 )]] Texture2D g_Texture : register(t0); +[[vk::binding( 2, 0 )]] SamplerState g_Sampler : register(s0); + +struct VERTEX_OUT +{ + float4 CurrentPosition : TEXCOORD0; + float4 PreviousPosition : TEXCOORD1; + float3 TexCoord : TEXCOORD2; + float4 Position : SV_POSITION; +}; + + +VERTEX_OUT VSMain( uint vertexId : SV_VertexID, uint instanceId : SV_InstanceID ) +{ + VERTEX_OUT output = (VERTEX_OUT)0; + + const float2 offsets[ 4 ] = + { + float2( -1, 1 ), + float2( 1, 1 ), + float2( -1, -1 ), + float2( 1, -1 ), + }; + + float2 offset = offsets[ vertexId ]; + float2 uv = (offset+1)*float2( instanceId == 0 ? -0.5 : 0.5, -0.5 ); + + float4 worldPos = float4( offsets[ vertexId ], 0.0, 1.0 ); + + worldPos.xyz += instanceId == 0 ? 
float3( -13, 1.5, 2 ) : float3( -13, 1.5, -2 );
+
+    output.CurrentPosition = mul( g_CurrentViewProjection, worldPos );
+    output.PreviousPosition = mul( g_PreviousViewProjection, worldPos );
+
+    output.Position = output.CurrentPosition;
+
+    output.TexCoord.xy = uv;
+    output.TexCoord.z = instanceId;
+
+    return output;
+}
+
+struct Output
+{
+    float4 finalColor : SV_TARGET0;
+    float2 motionVectors : SV_TARGET1;
+    float upscaleReactive : SV_TARGET2;
+    float upscaleTransparencyAndComposition : SV_TARGET3;
+};
+
+
+float4 TextureLookup( int billboardIndex, float2 uv0 )
+{
+    float4 color = 1;
+
+    if ( billboardIndex == 0 || g_Mode == 2 )
+    {
+        // Scrolling
+        float2 uv = uv0;
+        if ( g_Mode == 2 )
+            uv += float2( -g_ScrollFactor, 0.0 );
+        else
+            uv += float2( -g_ScrollFactor, 0.5*g_ScrollFactor );
+
+        color.rgb = g_Texture.SampleLevel( g_Sampler, uv, 0 ).rgb;
+    }
+    else if ( billboardIndex == 1 )
+    {
+        // Rotated UVs
+        float s, c;
+        sincos( g_RotationFactor, s, c );
+        float2x2 rotation = { float2( c, s ), float2( -s, c ) };
+
+        float2 rotatedUV = mul( rotation, uv0-float2( 0.5, -0.5) );
+        color.rgb = g_Texture.SampleLevel( g_Sampler, rotatedUV, 0 ).rgb;
+    }
+
+    return color;
+}
+
+
+Output PSMain( VERTEX_OUT input )
+{
+    Output output = (Output)0;
+
+    output.finalColor = TextureLookup( (int)input.TexCoord.z, input.TexCoord.xy );
+
+    // NDC-space offset from the current to the previous position, with the camera jitter delta removed, scaled into UV space (y flipped).
+    output.motionVectors = (input.PreviousPosition.xy / input.PreviousPosition.w) - (input.CurrentPosition.xy / input.CurrentPosition.w) + g_CameraJitterCompensation;
+    output.motionVectors *= float2(0.5f, -0.5f);
+
+    output.upscaleReactive = 0; // Nothing to write to the reactive mask. Color writes are off on this target anyway.
+    output.upscaleTransparencyAndComposition = 1; // Write a value into here to indicate the depth and motion vectors are as expected for a static object, but the surface contents are changing.
+ + return output; +} \ No newline at end of file diff --git a/src/VK/CMakeLists.txt b/src/VK/CMakeLists.txt index 720e3b8..a8d8b60 100644 --- a/src/VK/CMakeLists.txt +++ b/src/VK/CMakeLists.txt @@ -38,6 +38,15 @@ set(sources stdafx.h UI.cpp UI.h + ../GpuParticles/ParticleHelpers.h + ../GpuParticles/ParticleSystem.h + ../GpuParticles/ParticleSystemInternal.h + ../GpuParticles/vk/BufferHelper.h + ../GpuParticles/vk/GPUParticleSystem.cpp + ../GpuParticles/vk/ParallelSort.h + ../GpuParticles/vk/ParallelSort.cpp + AnimatedTexture.cpp + AnimatedTexture.h dpiawarescaling.manifest) set(fsr1_shaders_src @@ -94,7 +103,20 @@ set(fsr2_shaders_src ${CMAKE_CURRENT_SOURCE_DIR}/../ffx-fsr2-api/shaders/ffx_fsr2_rcas_pass.glsl) set(particle_shaders_src + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParticleStructs.h + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParticleHelpers.h + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/fp16util.h + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParallelSortCS.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParticleEmit.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParticleRender.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ParticleSimulation.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/ShaderConstants.h + ${CMAKE_CURRENT_SOURCE_DIR}/../GpuParticleShaders/SimulationBindings.h + ${CMAKE_CURRENT_SOURCE_DIR}/../ffx-parallelsort/FFX_ParallelSort.h) + +set(sample_shaders_src ${CMAKE_CURRENT_SOURCE_DIR}/GPUFrameRateLimiter.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/AnimatedTexture.hlsl ${CMAKE_CURRENT_SOURCE_DIR}/DebugBlit.hlsl ${CMAKE_CURRENT_SOURCE_DIR}/UpscaleSpatial.hlsl ${CMAKE_CURRENT_SOURCE_DIR}/FSRPass.hlsl) @@ -104,12 +126,14 @@ set(APP_ICON_GPUOPEN "${CMAKE_CURRENT_SOURCE_DIR}/../common/GpuOpenIcon.rc") source_group("sources" FILES ${sources}) source_group("spatial_shaders" FILES ${fsr1_shaders_src}) source_group("fsr2_shaders" FILES ${fsr2_shaders_src}) +source_group("particle_shaders" FILES ${particle_shaders_src}) source_group("sample_shaders" FILES ${sample_shaders_src}) copyCommand("${spd_shaders_src}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibVK) copyCommand("${fsr1_shaders_src}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibVK) copyCommand("${fsr2_shaders_src}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibVK) copyCommand("${particle_shaders_src}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibVK) +copyCommand("${sample_shaders_src}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibVK) add_executable(FSR2_Sample_VK WIN32 ${sources} ${fsr2_src} ${sample_shaders_src} ${fsr1_shaders_src} ${fsr2_shaders_src} ${particle_shaders_src} ${spd_shaders_src} ${common} ${APP_ICON_GPUOPEN}) target_compile_definitions(FSR2_Sample_VK PRIVATE $<$:FSR2_DEBUG_SHADERS=1>) diff --git a/src/VK/FSR2Sample.cpp b/src/VK/FSR2Sample.cpp index ec2dc34..2491710 100644 --- a/src/VK/FSR2Sample.cpp +++ b/src/VK/FSR2Sample.cpp @@ -51,7 +51,6 @@ void FSR2Sample::OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* // set some default values *pWidth = 1920; *pHeight = 1080; - m_activeScene = 0; //load the first one by default m_VsyncEnabled = false; m_bIsBenchmarking = false; m_fontSize = 13.f; // default value overridden by a json file if available @@ -66,7 +65,7 @@ void FSR2Sample::OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* *pWidth = jData.value("width", *pWidth); *pHeight = jData.value("height", *pHeight); m_fullscreenMode = jData.value("presentationMode", m_fullscreenMode); - m_activeScene = jData.value("activeScene", m_activeScene); + 
m_UIState.m_activeScene = jData.value("activeScene", m_UIState.m_activeScene); m_activeCamera = jData.value("activeCamera", m_activeCamera); m_isCpuValidationLayerEnabled = jData.value("CpuValidationLayerEnabled", m_isCpuValidationLayerEnabled); m_isGpuValidationLayerEnabled = jData.value("GpuValidationLayerEnabled", m_isGpuValidationLayerEnabled); @@ -791,7 +790,7 @@ int WINAPI WinMain(HINSTANCE hInstance, LPSTR lpCmdLine, int nCmdShow) { - LPCSTR Name = "FidelityFX Super Resolution 2.0"; + LPCSTR Name = "FidelityFX Super Resolution 2.1"; // create new DX sample return RunFramework(hInstance, lpCmdLine, nCmdShow, new FSR2Sample(Name)); diff --git a/src/VK/FSR2Sample.h b/src/VK/FSR2Sample.h index 3f796ee..141126a 100644 --- a/src/VK/FSR2Sample.h +++ b/src/VK/FSR2Sample.h @@ -77,7 +77,6 @@ private: // json config file json m_jsonConfigFile; std::vector m_sceneNames; - int m_activeScene; int m_activeCamera; bool m_bPlay; diff --git a/src/VK/Renderer.cpp b/src/VK/Renderer.cpp index 7e64ba2..8a46472 100644 --- a/src/VK/Renderer.cpp +++ b/src/VK/Renderer.cpp @@ -48,12 +48,12 @@ void Renderer::OnCreate(Device *pDevice, SwapChain *pSwapChain, float FontSize, m_ConstantBufferRing.OnCreate(pDevice, backBufferCount, constantBuffersMemSize, "Uniforms"); // Create a 'static' pool for vertices and indices - const uint32_t staticGeometryMemSize = (1024) * 1024 * 1024; + const uint32_t staticGeometryMemSize = (5 * 128) * 1024 * 1024; m_VidMemBufferPool.OnCreate(pDevice, staticGeometryMemSize, true, "StaticGeom"); // Create a 'static' pool for vertices and indices in system memory - const uint32_t systemGeometryMemSize = 32 * 1024; - m_SysMemBufferPool.OnCreate(pDevice, systemGeometryMemSize, false, "PostProcGeom"); + //const uint32_t systemGeometryMemSize = 16 * 1024; + // m_SysMemBufferPool.OnCreate(pDevice, systemGeometryMemSize, false, "PostProcGeom"); // initialize the GPU time stamps module m_GPUTimer.OnCreate(pDevice, backBufferCount); @@ -82,16 +82,17 @@ void Renderer::OnCreate(Device *pDevice, SwapChain *pSwapChain, float FontSize, if (bInvertedDepth) fullGBuffer |= GBUFFER_INVERTED_DEPTH; bool bClear = true; - m_RenderPassFullGBufferWithClear.OnCreate(&m_GBuffer, fullGBuffer, bClear,"m_RenderPassFullGBufferWithClear"); + m_RenderPassFullGBufferWithClear.OnCreate(&m_GBuffer, fullGBuffer, bClear, "m_RenderPassFullGBufferWithClear"); m_RenderPassFullGBuffer.OnCreate(&m_GBuffer, fullGBuffer, !bClear, "m_RenderPassFullGBuffer"); m_RenderPassJustDepthAndHdr.OnCreate(&m_GBuffer, GBUFFER_DEPTH | GBUFFER_FORWARD, !bClear, "m_RenderPassJustDepthAndHdr"); + m_RenderPassFullGBufferNoDepthWrite.OnCreate(&m_GBuffer, fullGBuffer, !bClear, VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL, VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL, "m_RenderPassFullGBufferNoDepthWrite"); } // Create render pass shadow, will clear contents { VkAttachmentDescription depthAttachments; AttachClearBeforeUse(VK_FORMAT_D32_SFLOAT, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, &depthAttachments); - m_Render_pass_shadow = CreateRenderPassOptimal(m_pDevice->GetDevice(), 0, NULL, &depthAttachments); + m_Render_pass_shadow = CreateRenderPassOptimal(m_pDevice->GetDevice(), 0, NULL, &depthAttachments, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); } m_SkyDome.OnCreate(pDevice, m_RenderPassJustDepthAndHdr.GetRenderPass(), &m_UploadHeap, VK_FORMAT_R16G16B16A16_SFLOAT, &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_VidMemBufferPool, 
"..\\media\\cauldron-media\\envmaps\\papermill\\diffuse.dds", "..\\media\\cauldron-media\\envmaps\\papermill\\specular.dds", VK_SAMPLE_COUNT_1_BIT, m_bInvertedDepth); @@ -114,7 +115,14 @@ void Renderer::OnCreate(Device *pDevice, SwapChain *pSwapChain, float FontSize, m_VidMemBufferPool.UploadData(m_UploadHeap.GetCommandList()); m_UploadHeap.FlushAndFinish(); + m_pGPUParticleSystem = IParticleSystem::CreateGPUSystem("..\\media\\atlas.dds"); + m_pGPUParticleSystem->OnCreateDevice(*pDevice, m_UploadHeap, m_ResourceViewHeaps, m_VidMemBufferPool, m_ConstantBufferRing, m_RenderPassFullGBufferNoDepthWrite.GetRenderPass()); + m_GpuFrameRateLimiter.OnCreate(pDevice, &m_ConstantBufferRing, &m_ResourceViewHeaps); + + m_AnimatedTextures.OnCreate( *pDevice, m_UploadHeap, m_VidMemBufferPool, m_RenderPassFullGBufferWithClear.GetRenderPass(), m_ResourceViewHeaps, m_ConstantBufferRing ); + + ResetScene(); } //-------------------------------------------------------------------------------------- @@ -124,8 +132,13 @@ void Renderer::OnCreate(Device *pDevice, SwapChain *pSwapChain, float FontSize, //-------------------------------------------------------------------------------------- void Renderer::OnDestroy() { + m_AnimatedTextures.OnDestroy(); m_GpuFrameRateLimiter.OnDestroy(); + m_pGPUParticleSystem->OnDestroyDevice(); + delete m_pGPUParticleSystem; + m_pGPUParticleSystem = nullptr; + m_AsyncPool.Flush(); m_ImGUI.OnDestroy(); @@ -140,14 +153,18 @@ void Renderer::OnDestroy() m_SkyDomeProc.OnDestroy(); m_SkyDome.OnDestroy(); + m_RenderPassFullGBufferNoDepthWrite.OnDestroy(); m_RenderPassFullGBufferWithClear.OnDestroy(); m_RenderPassJustDepthAndHdr.OnDestroy(); m_RenderPassFullGBuffer.OnDestroy(); m_GBuffer.OnDestroy(); - m_pUpscaleContext->OnDestroy(); - delete m_pUpscaleContext; - m_pUpscaleContext = NULL; + if (m_pUpscaleContext) + { + m_pUpscaleContext->OnDestroy(); + delete m_pUpscaleContext; + m_pUpscaleContext = NULL; + } vkDestroyRenderPass(m_pDevice->GetDevice(), m_Render_pass_shadow, nullptr); @@ -179,6 +196,7 @@ void Renderer::OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, UISta m_RenderPassFullGBufferWithClear.OnCreateWindowSizeDependentResources(m_Width, m_Height); m_RenderPassJustDepthAndHdr.OnCreateWindowSizeDependentResources(m_Width, m_Height); m_RenderPassFullGBuffer.OnCreateWindowSizeDependentResources(m_Width, m_Height); + m_RenderPassFullGBufferNoDepthWrite.OnCreateWindowSizeDependentResources(m_Width, m_Height); bool renderNative = (pState->m_nUpscaleType == UPSCALE_TYPE_NATIVE); bool hdr = (pSwapChain->GetDisplayMode() != DISPLAYMODE_SDR); @@ -207,6 +225,8 @@ void Renderer::OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, UISta m_ImGUI.UpdatePipeline((pSwapChain->GetDisplayMode() == DISPLAYMODE_SDR) ? 
pSwapChain->GetRenderPass() : m_RenderPassDisplayOutput); + m_pGPUParticleSystem->OnResizedSwapChain(pState->renderWidth, pState->renderHeight, m_GBuffer.m_DepthBuffer, m_RenderPassFullGBufferNoDepthWrite.GetFramebuffer()); + // Lazy Upscale context generation: if ((m_pUpscaleContext == NULL) || (pState->m_nUpscaleType != m_pUpscaleContext->Type())) { @@ -223,7 +243,7 @@ void Renderer::OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, UISta UpscaleContext::FfxUpscaleInitParams upscaleParams = { pState->m_nUpscaleType, m_bInvertedDepth, m_pDevice, pSwapChain->GetFormat(), &m_UploadHeap, backBufferCount }; m_pUpscaleContext = UpscaleContext::CreateUpscaleContext(upscaleParams); } - m_pUpscaleContext->OnCreateWindowSizeDependentResources(nullptr, m_displayOutputSRV, pState->renderWidth, pState->renderHeight, pState->displayWidth, pState->displayHeight, hdr); + m_pUpscaleContext->OnCreateWindowSizeDependentResources(nullptr, m_displayOutputSRV, pState->renderWidth, pState->renderHeight, pState->displayWidth, pState->displayHeight, true); } //-------------------------------------------------------------------------------------- @@ -233,6 +253,10 @@ void Renderer::OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, UISta //-------------------------------------------------------------------------------------- void Renderer::OnDestroyWindowSizeDependentResources() { + m_pDevice->GPUFlush(); + + m_pGPUParticleSystem->OnReleasingSwapChain(); + vkDestroyImageView(m_pDevice->GetDevice(), m_OpaqueTextureSRV, 0); vkDestroyFramebuffer(m_pDevice->GetDevice(), m_FramebufferDisplayOutput, 0); vkDestroyImageView(m_pDevice->GetDevice(), m_displayOutputSRV, 0); @@ -253,6 +277,7 @@ void Renderer::OnDestroyWindowSizeDependentResources() m_RenderPassFullGBufferWithClear.OnDestroyWindowSizeDependentResources(); m_RenderPassJustDepthAndHdr.OnDestroyWindowSizeDependentResources(); m_RenderPassFullGBuffer.OnDestroyWindowSizeDependentResources(); + m_RenderPassFullGBufferNoDepthWrite.OnDestroyWindowSizeDependentResources(); m_GBuffer.OnDestroyWindowSizeDependentResources(); // destroy upscale context @@ -556,6 +581,7 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai m_pUpscaleContext->PreDraw(pState); + float fLightMod = 1.f; // Sets the perFrame data per_frame *pPerFrame = NULL; if (m_pGLTFTexturesAndBuffers) @@ -578,6 +604,27 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai m_pGLTFTexturesAndBuffers->SetSkinningMatricesForSkeletons(); } + { + m_state.flags = IParticleSystem::PF_Streaks | IParticleSystem::PF_DepthCull | IParticleSystem::PF_Sort; + m_state.flags |= pState->nReactiveMaskMode == REACTIVE_MASK_MODE_ON ? IParticleSystem::PF_Reactive : 0; + + const Camera& camera = pState->camera; + m_state.constantData.m_ViewProjection = camera.GetProjection() * camera.GetView(); + m_state.constantData.m_View = camera.GetView(); + m_state.constantData.m_ViewInv = math::inverse(camera.GetView()); + m_state.constantData.m_Projection = camera.GetProjection(); + m_state.constantData.m_ProjectionInv = math::inverse(camera.GetProjection()); + m_state.constantData.m_SunDirection = math::Vector4(0.7f, 0.7f, 0, 0); + m_state.constantData.m_SunColor = math::Vector4(0.8f, 0.8f, 0.7f, 0); + m_state.constantData.m_AmbientColor = math::Vector4(0.2f, 0.2f, 0.3f, 0); + + m_state.constantData.m_SunColor *= fLightMod; + m_state.constantData.m_AmbientColor *= fLightMod; + + m_state.constantData.m_FrameTime = pState->m_bPlayAnimations ? 
(0.001f * (float)pState->deltaTime) : 0.0f; + PopulateEmitters(pState->m_bPlayAnimations, pState->m_activeScene, 0.001f * (float)pState->deltaTime); + } + // Render all shadow maps if (m_GLTFDepth && pPerFrame != NULL) { @@ -646,6 +693,12 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai vkCmdSetViewport(cmdBuf1, 0, 1, &vpr); m_GLTFPBR->DrawBatchList(cmdBuf1, &opaque, bWireframe); + + if (pState->bRenderAnimatedTextures) + { + m_AnimatedTextures.Render(cmdBuf1, pState->m_bPlayAnimations ? (0.001f * (float)pState->deltaTime) : 0.0f, pState->m_fTextureAnimationSpeed, pState->bCompositionMask, Cam); + } + m_GPUTimer.GetTimeStamp(cmdBuf1, "PBR Opaque"); m_RenderPassFullGBufferWithClear.EndPass(cmdBuf1); @@ -683,7 +736,7 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai m_RenderPassJustDepthAndHdr.EndPass(cmdBuf1); } - if (pState->bUseFsr2AutoReactive|true) + if (pState->nReactiveMaskMode == REACTIVE_MASK_MODE_AUTOGEN) { // Copy resource before we render transparent stuff { @@ -764,13 +817,34 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai barriers[1].newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; barriers[1].image = m_GBuffer.m_HDR.Resource(); - vkCmdPipelineBarrier(cmdBuf1, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, NULL, 0, NULL, 2, barriers); + vkCmdPipelineBarrier(cmdBuf1, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, NULL, 0, NULL, 2, barriers); } } // draw transparent geometry { - m_RenderPassFullGBuffer.BeginPass(cmdBuf1, currentRect); + if (pState->bRenderParticleSystem) + { + VkImageMemoryBarrier barrier = {}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.oldLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.image = m_GBuffer.m_DepthBuffer.Resource(); + vkCmdPipelineBarrier(cmdBuf1, VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &barrier); + + m_pGPUParticleSystem->Render(cmdBuf1, m_ConstantBufferRing, m_state.flags, m_state.emitters, m_state.numEmitters, m_state.constantData); + } + + m_RenderPassFullGBufferNoDepthWrite.BeginPass(cmdBuf1, currentRect); vkCmdSetScissor(cmdBuf1, 0, 1, &srr); vkCmdSetViewport(cmdBuf1, 0, 1, &vpr); @@ -779,11 +853,28 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai m_GLTFPBR->DrawBatchList(cmdBuf1, &transparent, bWireframe); m_GPUTimer.GetTimeStamp(cmdBuf1, "PBR Transparent"); - m_RenderPassFullGBuffer.EndPass(cmdBuf1); + m_RenderPassFullGBufferNoDepthWrite.EndPass(cmdBuf1); } // draw object's bounding boxes { + // Put the depth buffer back from the read only state to the write state + VkImageMemoryBarrier barrier = {}; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.srcAccessMask = 
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT; + barrier.dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + barrier.oldLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL; + barrier.newLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.image = m_GBuffer.m_DepthBuffer.Resource(); + vkCmdPipelineBarrier(cmdBuf1, VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT, 0, 0, nullptr, 0, nullptr, 1, &barrier); + m_RenderPassJustDepthAndHdr.BeginPass(cmdBuf1, currentRect); vkCmdSetScissor(cmdBuf1, 0, 1, &srr); @@ -850,7 +941,7 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai SetPerfMarkerEnd(cmdBuf1); // if FSR2 and auto reactive mask is enabled: generate reactive mask - if (pState->bUseFsr2AutoReactive) + if (pState->nReactiveMaskMode == REACTIVE_MASK_MODE_AUTOGEN) { VkImageMemoryBarrier barrier = {}; barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; @@ -940,8 +1031,8 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai barriers[1].image = m_GBuffer.m_MotionVectors.Resource(); barriers[2] = barrier; - barriers[2].srcAccessMask = pState->bUseFsr2AutoReactive ? VK_ACCESS_SHADER_WRITE_BIT : VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - barriers[2].oldLayout = pState->bUseFsr2AutoReactive ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + barriers[2].srcAccessMask = pState->nReactiveMaskMode == REACTIVE_MASK_MODE_AUTOGEN ? VK_ACCESS_SHADER_WRITE_BIT : VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + barriers[2].oldLayout = pState->nReactiveMaskMode == REACTIVE_MASK_MODE_AUTOGEN ? 
VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; barriers[2].image = m_GBuffer.m_UpscaleReactive.Resource(); barriers[3] = barrier; @@ -964,7 +1055,7 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai VkPipelineStageFlags srcStageFlags = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; - if (pState->bUseFsr2AutoReactive) + if (pState->nReactiveMaskMode == REACTIVE_MASK_MODE_AUTOGEN) srcStageFlags |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; vkCmdPipelineBarrier(cmdBuf1, srcStageFlags, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, NULL, 0, NULL, 6, barriers); @@ -1303,6 +1394,106 @@ void Renderer::OnRender(UIState* pState, const Camera& Cam, SwapChain* pSwapChai } } + +void Renderer::ResetScene() +{ + ZeroMemory(m_EmissionRates, sizeof(m_EmissionRates)); + + // Reset the particle system when the scene changes so no particles from the previous scene persist + m_pGPUParticleSystem->Reset(); +} + +void Renderer::PopulateEmitters(bool playAnimations, int activeScene, float frameTime) +{ + IParticleSystem::EmitterParams sparksEmitter = {}; + IParticleSystem::EmitterParams smokeEmitter = {}; + + sparksEmitter.m_NumToEmit = 0; + sparksEmitter.m_ParticleLifeSpan = 1.0f; + sparksEmitter.m_StartSize = 0.6f * 0.02f; + sparksEmitter.m_EndSize = 0.4f * 0.02f; + sparksEmitter.m_VelocityVariance = 1.5f; + sparksEmitter.m_Mass = 1.0f; + sparksEmitter.m_TextureIndex = 1; + sparksEmitter.m_Streaks = true; + + smokeEmitter.m_NumToEmit = 0; + smokeEmitter.m_ParticleLifeSpan = 50.0f; + smokeEmitter.m_StartSize = 0.4f; + smokeEmitter.m_EndSize = 1.0f; + smokeEmitter.m_VelocityVariance = 1.0f; + smokeEmitter.m_Mass = 0.0003f; + smokeEmitter.m_TextureIndex = 0; + smokeEmitter.m_Streaks = false; + + if ( activeScene == 0 ) // scene 0 = warehouse + { + m_state.numEmitters = 2; + m_state.emitters[0] = sparksEmitter; + m_state.emitters[1] = sparksEmitter; + + m_state.emitters[0].m_Position = math::Vector4(-4.15f, -1.85f, -3.8f, 1.0f); + m_state.emitters[0].m_PositionVariance = math::Vector4(0.1f, 0.0f, 0.0f, 1.0f); + m_state.emitters[0].m_Velocity = math::Vector4(0.0f, 0.08f, 0.8f, 1.0f); + m_EmissionRates[0].m_ParticlesPerSecond = 300.0f; + + m_state.emitters[1].m_Position = math::Vector4(-4.9f, -1.5f, -4.8f, 1.0f); + m_state.emitters[1].m_PositionVariance = math::Vector4(0.0f, 0.0f, 0.0f, 1.0f); + m_state.emitters[1].m_Velocity = math::Vector4(0.0f, 0.8f, -0.8f, 1.0f); + m_EmissionRates[1].m_ParticlesPerSecond = 400.0f; + + m_state.constantData.m_StartColor[0] = math::Vector4(10.0f, 10.0f, 2.0f, 0.9f); + m_state.constantData.m_EndColor[0] = math::Vector4(10.0f, 10.0f, 0.0f, 0.1f); + m_state.constantData.m_StartColor[1] = math::Vector4(10.0f, 10.0f, 2.0f, 0.9f); + m_state.constantData.m_EndColor[1] = math::Vector4(10.0f, 10.0f, 0.0f, 0.1f); + } + else if (activeScene == 1) // Sponza + { + m_state.numEmitters = 2; + m_state.emitters[0] = smokeEmitter; + m_state.emitters[1] = sparksEmitter; + + m_state.emitters[0].m_Position = math::Vector4(-13.0f, 0.0f, 1.4f, 1.0f); + m_state.emitters[0].m_PositionVariance = math::Vector4(0.1f, 0.0f, 0.1f, 1.0f); + m_state.emitters[0].m_Velocity = math::Vector4(0.0f, 0.2f, 0.0f, 1.0f); + m_EmissionRates[0].m_ParticlesPerSecond = 10.0f; + + m_state.emitters[1].m_Position = math::Vector4(-13.0f, 0.0f, -1.4f, 1.0f); + m_state.emitters[1].m_PositionVariance = math::Vector4(0.05f, 0.0f, 0.05f, 1.0f); + m_state.emitters[1].m_Velocity = math::Vector4(0.0f, 4.0f, 0.0f, 1.0f); + 
m_state.emitters[1].m_VelocityVariance = 0.5f; + m_state.emitters[1].m_StartSize = 0.02f; + m_state.emitters[1].m_EndSize = 0.02f; + m_state.emitters[1].m_Mass = 1.0f; + m_EmissionRates[1].m_ParticlesPerSecond = 500.0f; + + m_state.constantData.m_StartColor[0] = math::Vector4(0.3f, 0.3f, 0.3f, 0.4f); + m_state.constantData.m_EndColor[0] = math::Vector4(0.4f, 0.4f, 0.4f, 0.1f); + m_state.constantData.m_StartColor[1] = math::Vector4(10.0f, 10.0f, 10.0f, 0.9f); + m_state.constantData.m_EndColor[1] = math::Vector4(5.0f, 8.0f, 5.0f, 0.1f); + } + + // Update all our active emitters so we know how many whole numbers of particles to emit from each emitter this frame + for (int i = 0; i < m_state.numEmitters; i++) + { + m_state.constantData.m_EmitterLightingCenter[i] = m_state.emitters[ i ].m_Position; + + if (m_EmissionRates[i].m_ParticlesPerSecond > 0.0f) + { + m_EmissionRates[i].m_Accumulation += m_EmissionRates[i].m_ParticlesPerSecond * (playAnimations ? frameTime : 0.0f); + + if (m_EmissionRates[i].m_Accumulation > 1.0f) + { + float integerPart = 0.0f; + float fraction = modf(m_EmissionRates[i].m_Accumulation, &integerPart); + + m_state.emitters[i].m_NumToEmit = (int)integerPart; + m_EmissionRates[i].m_Accumulation = fraction; + } + } + } +} + void Renderer::BuildDevUI(UIState* pState) { if (m_pUpscaleContext) diff --git a/src/VK/Renderer.h b/src/VK/Renderer.h index 07faff1..5e6d39e 100644 --- a/src/VK/Renderer.h +++ b/src/VK/Renderer.h @@ -27,6 +27,10 @@ #include "PostProc/MagnifierPS.h" #include "UpscaleContext.h" #include "GPUFrameRateLimiter.h" +#include "AnimatedTexture.h" + +#include "../GpuParticles/ParticleSystem.h" +#include "../GpuParticleShaders/ShaderConstants.h" // We are queuing (backBufferCount + 0.5) frames, so we need to triple buffer the resources that get modified each frame static const int backBufferCount = 3; @@ -62,6 +66,25 @@ public: void BuildDevUI(UIState* pState); private: + + struct State + { + float frameTime = 0.0f; + int numEmitters = 0; + IParticleSystem::EmitterParams emitters[10] = {}; + int flags = 0; + IParticleSystem::ConstantData constantData = {}; + }; + + struct EmissionRate + { + float m_ParticlesPerSecond = 0.0f; // Number of particles to emit per second + float m_Accumulation = 0.0f; // Running total of how many particles to emit over elapsed time + }; + + void ResetScene(); + void PopulateEmitters(bool playAnimations, int activeScene, float frameTime); + Device *m_pDevice; uint32_t m_Width; @@ -97,6 +120,13 @@ private: MagnifierPS m_MagnifierPS; bool m_bMagResourceReInit = false; + // GPU Particle System + State m_state = {}; + IParticleSystem* m_pGPUParticleSystem = nullptr; + EmissionRate m_EmissionRates[NUM_EMITTERS] = {}; + + AnimatedTextures m_AnimatedTextures = {}; + // Upscale UpscaleContext* m_pUpscaleContext = nullptr; VkRenderPass m_RenderPassDisplayOutput; @@ -110,6 +140,7 @@ private: GBufferRenderPass m_RenderPassFullGBufferWithClear; GBufferRenderPass m_RenderPassJustDepthAndHdr; GBufferRenderPass m_RenderPassFullGBuffer; + GBufferRenderPass m_RenderPassFullGBufferNoDepthWrite; Texture m_OpaqueTexture; VkImageView m_OpaqueTextureSRV; diff --git a/src/VK/UI.cpp b/src/VK/UI.cpp index 4b47576..8c8dcfb 100644 --- a/src/VK/UI.cpp +++ b/src/VK/UI.cpp @@ -73,7 +73,7 @@ void FSR2Sample::BuildUI() // if we haven't initialized GLTFLoader yet, don't draw UI. 
if (m_pGltfLoader == nullptr) { - LoadScene(m_activeScene); + LoadScene(m_UIState.m_activeScene); return; } @@ -133,13 +133,13 @@ void FSR2Sample::BuildUI() ImGui::Checkbox("Camera Headbobbing", &m_UIState.m_bHeadBobbing); auto getterLambda = [](void* data, int idx, const char** out_str)->bool { *out_str = ((std::vector *)data)->at(idx).c_str(); return true; }; - if (ImGui::Combo("Model", &m_activeScene, getterLambda, &m_sceneNames, (int)m_sceneNames.size())) + if (ImGui::Combo("Model", &m_UIState.m_activeScene, getterLambda, &m_sceneNames, (int)m_sceneNames.size())) { - m_UIState.bRenderParticleSystem = (m_activeScene == 11); + m_UIState.bRenderAnimatedTextures = (m_UIState.m_activeScene == 1); // Note: // probably queueing this as an event and handling it at the end/beginning // of frame is a better idea rather than in the middle of drawing UI. - LoadScene(m_activeScene); + LoadScene(m_UIState.m_activeScene); //bail out as we need to reload everything ImGui::End(); @@ -188,7 +188,7 @@ void FSR2Sample::BuildUI() OnResize(true); } - if (m_UIState.m_nUpscaleType <= UPSCALE_TYPE_FSR_2_0) + if (m_UIState.m_nUpscaleType == UPSCALE_TYPE_FSR_2_0) { // adjust to match the combo box options int32_t upscaleQualityMode = m_nUpscaleMode - UPSCALE_QUALITY_MODE_QUALITY; @@ -214,20 +214,28 @@ void FSR2Sample::BuildUI() OnResize(); } - if (m_UIState.m_nUpscaleType == UPSCALE_TYPE_FSR_2_0) + + if (ImGui::Checkbox("Dynamic resolution", &m_UIState.bDynamicRes)) { - if (ImGui::Checkbox("Dynamic resolution", &m_UIState.bDynamicRes)) { - OnResize(); - } + OnResize(); } - else - m_UIState.bDynamicRes = false; + + const char* reactiveOptions[] = { "Disabled", "Manual Reactive Mask Generation", "Autogen FSR2 Helper Function" }; + ImGui::Combo("Reactive Mask mode", (int*)(&m_UIState.nReactiveMaskMode), reactiveOptions, _countof(reactiveOptions)); + + ImGui::Checkbox("Use Transparency and Composition Mask", &m_UIState.bCompositionMask); } else { m_UIState.mipBias = mipBias[UPSCALE_TYPE_NATIVE]; } + + if (m_UIState.m_nUpscaleType != UPSCALE_TYPE_FSR_2_0) + { + m_UIState.bDynamicRes = false; + } + ImGui::Checkbox("RCAS Sharpening", &m_UIState.bUseRcas); if (m_UIState.m_nUpscaleType == UPSCALE_TYPE_FSR_2_0) { @@ -318,7 +326,7 @@ void FSR2Sample::BuildUI() if (ImGui::CollapsingHeader("Presentation Mode", ImGuiTreeNodeFlags_DefaultOpen)) { - const char* fullscreenModes[] = { "Windowed", "BorderlessFullscreen", "ExclusiveFulscreen" }; + const char* fullscreenModes[] = { "Windowed", "BorderlessFullscreen", "ExclusiveFullscreen" }; if (ImGui::Combo("Fullscreen Mode", (int*)&m_fullscreenMode, fullscreenModes, _countof(fullscreenModes))) { if (m_previousFullscreenMode != m_fullscreenMode) @@ -633,4 +641,4 @@ bool UIState::DevOption(float* pFloatValue, const char* name, float fMin, float void UIState::Text(const char* text) { ImGui::Text(text); -} \ No newline at end of file +} diff --git a/src/VK/UI.h b/src/VK/UI.h index 844e18c..52be14f 100644 --- a/src/VK/UI.h +++ b/src/VK/UI.h @@ -55,12 +55,26 @@ typedef enum UpscaleQualityMode { UPSCALE_QUALITY_MODE_COUNT } UpscaleQualityMode; +typedef enum ReactiveMaskMode { + REACTIVE_MASK_MODE_OFF = 0, // Nothing written to the reactive mask + REACTIVE_MASK_MODE_ON = 1, // Particles written to the reactive mask + REACTIVE_MASK_MODE_AUTOGEN = 2, // The mask is auto generated using FSR2's helper function + + // add above this. 
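// For orientation, the sample consumes these three modes roughly as follows (illustrative
// summary; see Renderer.cpp and UpscaleContext_FSR2_API.cpp elsewhere in this patch):
//   OFF     -> an empty 1x1 reactive map is bound at FSR2 dispatch time,
//   ON      -> the GPU particle system renders with PF_Reactive set and writes into m_GBuffer.m_UpscaleReactive,
//   AUTOGEN -> GenerateReactiveMask() derives the mask from the copied opaque-only colour and the final scene colour.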
+ REACTIVE_MASK_MODE_COUNT +} ReactiveMaskMode; + struct UIState { Camera camera; bool m_bCameraInertia = false; bool m_bHeadBobbing = false; + bool m_bPlayAnimations = true; + float m_fTextureAnimationSpeed = 1.0f; + int m_activeScene = 0; + bool m_bAnimateSpotlight = false; + // // WINDOW MANAGEMENT // @@ -76,7 +90,9 @@ struct UIState bool bReset = false; - bool bRenderParticleSystem = false; + int nLightModulationMode = 0; + bool bRenderParticleSystem = true; + bool bRenderAnimatedTextures = true; bool bUseMagnifier; bool bLockMagnifierPosition; bool bLockMagnifierPositionHistory; @@ -108,15 +124,31 @@ struct UIState unsigned int closestVelocitySamplePattern = 0; // 5 samples float Feedback = 15.f / 16.f; - // FSR2 auto reactive - bool bUseFsr2AutoReactive = false; + // FSR2 reactive mask + ReactiveMaskMode nReactiveMaskMode = REACTIVE_MASK_MODE_ON; float fFsr2AutoReactiveScale = 1.f; - float fFsr2AutoReactiveThreshold = 0.01f; + float fFsr2AutoReactiveThreshold = 0.2f; + float fFsr2AutoReactiveBinaryValue = 0.9f; bool bFsr2AutoReactiveTonemap = true; bool bFsr2AutoReactiveInverseTonemap = false; bool bFsr2AutoReactiveThreshold = true; bool bFsr2AutoReactiveUseMax = true; + // FSR2 composition mask + bool bCompositionMask = true; + + // FSR2 + bool bUseDebugOut = false; + int nDebugBlitSurface = 6; // FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR + int nDebugOutMappingR = 0; + int nDebugOutMappingG = 1; + int nDebugOutMappingB = 2; + float v2DebugOutMappingR[2] = { 0.f, 1.f }; + float v2DebugOutMappingG[2] = { 0.f, 1.f }; + float v2DebugOutMappingB[2] = { 0.f, 1.f }; + + float v4DebugSliderValues[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; + // // APP/SCENE CONTROLS // diff --git a/src/VK/UpscaleContext_FSR2_API.cpp b/src/VK/UpscaleContext_FSR2_API.cpp index f98b837..4805ee9 100644 --- a/src/VK/UpscaleContext_FSR2_API.cpp +++ b/src/VK/UpscaleContext_FSR2_API.cpp @@ -101,8 +101,11 @@ void UpscaleContext_FSR2_API::OnCreateWindowSizeDependentResources( initializationParameters.maxRenderSize.height = renderHeight; initializationParameters.displaySize.width = displayWidth; initializationParameters.displaySize.height = displayHeight; - initializationParameters.flags = FFX_FSR2_ENABLE_DEPTH_INVERTED - | FFX_FSR2_ENABLE_AUTO_EXPOSURE; + initializationParameters.flags = FFX_FSR2_ENABLE_AUTO_EXPOSURE; + + if (m_bInvertedDepth) { + initializationParameters.flags |= FFX_FSR2_ENABLE_DEPTH_INVERTED; + } if (hdr) { initializationParameters.flags |= FFX_FSR2_ENABLE_HIGH_DYNAMIC_RANGE; @@ -117,7 +120,13 @@ void UpscaleContext_FSR2_API::OnCreateWindowSizeDependentResources( void UpscaleContext_FSR2_API::OnDestroyWindowSizeDependentResources() { UpscaleContext::OnDestroyWindowSizeDependentResources(); - ffxFsr2ContextDestroy(&context); + // only destroy contexts which are live + if (initializationParameters.callbacks.scratchBuffer != nullptr) + { + ffxFsr2ContextDestroy(&context); + free(initializationParameters.callbacks.scratchBuffer); + initializationParameters.callbacks.scratchBuffer = nullptr; + } } void UpscaleContext_FSR2_API::BuildDevUI(UIState* pState) @@ -144,6 +153,7 @@ void UpscaleContext_FSR2_API::GenerateReactiveMask(VkCommandBuffer pCommandList, generateReactiveParameters.scale = pState->fFsr2AutoReactiveScale; generateReactiveParameters.cutoffThreshold = pState->fFsr2AutoReactiveThreshold; + generateReactiveParameters.binaryValue = pState->fFsr2AutoReactiveBinaryValue; generateReactiveParameters.flags = (pState->bFsr2AutoReactiveTonemap ? 
FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_TONEMAP : 0) | (pState->bFsr2AutoReactiveInverseTonemap ? FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_INVERSETONEMAP : 0) | (pState->bFsr2AutoReactiveThreshold ? FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_THRESHOLD : 0) | @@ -160,8 +170,26 @@ void UpscaleContext_FSR2_API::Draw(VkCommandBuffer commandBuffer, const FfxUpsca dispatchParameters.depth = ffxGetTextureResourceVK(&context, cameraSetup.depthbufferResource->Resource(), cameraSetup.depthbufferResourceView, cameraSetup.depthbufferResource->GetWidth(), cameraSetup.depthbufferResource->GetHeight(), cameraSetup.depthbufferResource->GetFormat(), L"FSR2_InputDepth"); dispatchParameters.motionVectors = ffxGetTextureResourceVK(&context, cameraSetup.motionvectorResource->Resource(), cameraSetup.motionvectorResourceView, cameraSetup.motionvectorResource->GetWidth(), cameraSetup.motionvectorResource->GetHeight(), cameraSetup.motionvectorResource->GetFormat(), L"FSR2_InputMotionVectors"); dispatchParameters.exposure = ffxGetTextureResourceVK(&context, nullptr, nullptr, 1, 1, VK_FORMAT_UNDEFINED, L"FSR2_InputExposure"); - dispatchParameters.reactive = ffxGetTextureResourceVK(&context, cameraSetup.reactiveMapResource->Resource(), cameraSetup.reactiveMapResourceView, cameraSetup.reactiveMapResource->GetWidth(), cameraSetup.reactiveMapResource->GetHeight(), cameraSetup.reactiveMapResource->GetFormat(), L"FSR2_InputReactiveMap"); - dispatchParameters.transparencyAndComposition = ffxGetTextureResourceVK(&context, cameraSetup.transparencyAndCompositionResource->Resource(), cameraSetup.transparencyAndCompositionResourceView, cameraSetup.transparencyAndCompositionResource->GetWidth(), cameraSetup.transparencyAndCompositionResource->GetHeight(), cameraSetup.transparencyAndCompositionResource->GetFormat(), L"FSR2_TransparencyAndCompositionMap"); + + if ((pState->nReactiveMaskMode == ReactiveMaskMode::REACTIVE_MASK_MODE_ON) + || (pState->nReactiveMaskMode == ReactiveMaskMode::REACTIVE_MASK_MODE_AUTOGEN)) + { + dispatchParameters.reactive = ffxGetTextureResourceVK(&context, cameraSetup.reactiveMapResource->Resource(), cameraSetup.reactiveMapResourceView, cameraSetup.reactiveMapResource->GetWidth(), cameraSetup.reactiveMapResource->GetHeight(), cameraSetup.reactiveMapResource->GetFormat(), L"FSR2_InputReactiveMap"); + } + else + { + dispatchParameters.reactive = ffxGetTextureResourceVK(&context, nullptr, nullptr, 1, 1, VK_FORMAT_UNDEFINED, L"FSR2_EmptyInputReactiveMap"); + } + + if (pState->bCompositionMask == true) + { + dispatchParameters.transparencyAndComposition = ffxGetTextureResourceVK(&context, cameraSetup.transparencyAndCompositionResource->Resource(), cameraSetup.transparencyAndCompositionResourceView, cameraSetup.transparencyAndCompositionResource->GetWidth(), cameraSetup.transparencyAndCompositionResource->GetHeight(), cameraSetup.transparencyAndCompositionResource->GetFormat(), L"FSR2_TransparencyAndCompositionMap"); + } + else + { + dispatchParameters.transparencyAndComposition = ffxGetTextureResourceVK(&context, nullptr, nullptr, 1, 1, VK_FORMAT_UNDEFINED, L"FSR2_EmptyTransparencyAndCompositionMap"); + } + dispatchParameters.output = ffxGetTextureResourceVK(&context, cameraSetup.resolvedColorResource->Resource(), cameraSetup.resolvedColorResourceView, cameraSetup.resolvedColorResource->GetWidth(), cameraSetup.resolvedColorResource->GetHeight(), cameraSetup.resolvedColorResource->GetFormat(), L"FSR2_OutputUpscaledColor", FFX_RESOURCE_STATE_UNORDERED_ACCESS); dispatchParameters.jitterOffset.x = m_JitterX; 
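// Note: the m_JitterX / m_JitterY values passed to jitterOffset here are expected to match the
// sub-pixel jitter applied to the application's projection matrix when this frame was rendered.
// A minimal sketch of how they are typically derived with the FSR2 helper functions (the frame
// index and resolution variables are illustrative, not part of this file):
//   const int32_t jitterPhaseCount = ffxFsr2GetJitterPhaseCount(renderWidth, displayWidth);
//   ffxFsr2GetJitterOffset(&m_JitterX, &m_JitterY, frameIndex, jitterPhaseCount);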
dispatchParameters.jitterOffset.y = m_JitterY; diff --git a/src/VK/stdafx.h b/src/VK/stdafx.h index 3b86a85..a714eb1 100644 --- a/src/VK/stdafx.h +++ b/src/VK/stdafx.h @@ -85,5 +85,5 @@ #include "Widgets/WireframeSphere.h" - +#define API_VULKAN using namespace CAULDRON_VK; diff --git a/src/ffx-fsr2-api/CMakeLists.txt b/src/ffx-fsr2-api/CMakeLists.txt index cb291be..7ef023c 100644 --- a/src/ffx-fsr2-api/CMakeLists.txt +++ b/src/ffx-fsr2-api/CMakeLists.txt @@ -20,7 +20,6 @@ # THE SOFTWARE. cmake_minimum_required(VERSION 3.15) -#set(CMAKE_CONFIGURATION_TYPES Debug Release) set(CMAKE_DEBUG_POSTFIX d) option (FFX_FSR2_API_DX12 "Build FSR 2.0 DX12 backend" ON) @@ -55,6 +54,9 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${CMAKE_HOME_DIRECTORY}/bin/ffx_fsr2_ add_compile_definitions(_UNICODE) add_compile_definitions(UNICODE) +#add_compile_definitions(FSR2_VERSION_MAJOR=0) +#add_compile_definitions(FSR2_VERSION_MINOR=1) +#add_compile_definitions(FSR2_VERSION_PATCH=0) if(FSR2_VS_VERSION STREQUAL 2015) message(NOTICE "Forcing the SDK path for VS 2015") @@ -65,10 +67,19 @@ set(FFX_SC_EXECUTABLE ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/sc/FidelityFX_SC.exe) set(FFX_SC_BASE_ARGS - -reflection -deps=gcc -DFFX_GPU=1 -DOPT_PRECOMPUTE_REACTIVE_MAX=1) + -reflection -deps=gcc -DFFX_GPU=1 + # Only reprojection is to do half for now + -DFFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF=0 + -DFFX_FSR2_OPTION_ACCUMULATE_SAMPLERS_USE_DATA_HALF=0 + -DFFX_FSR2_OPTION_REPROJECT_SAMPLERS_USE_DATA_HALF=1 + -DFFX_FSR2_OPTION_POSTPROCESSLOCKSTATUS_SAMPLERS_USE_DATA_HALF=0 + # Upsample uses lanczos approximation + -DFFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE=2 + ) set(FFX_SC_PERMUTATION_ARGS - -DFFX_FSR2_OPTION_USE_LANCZOS_LUT={0,1} + # Reproject can use either reference lanczos or LUT + -DFFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE={0,1} -DFFX_FSR2_OPTION_HDR_COLOR_INPUT={0,1} -DFFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS={0,1} -DFFX_FSR2_OPTION_JITTERED_MOTION_VECTORS={0,1} diff --git a/src/ffx-fsr2-api/dx12/CMakeLists.txt b/src/ffx-fsr2-api/dx12/CMakeLists.txt index bc44412..77a30c4 100644 --- a/src/ffx-fsr2-api/dx12/CMakeLists.txt +++ b/src/ffx-fsr2-api/dx12/CMakeLists.txt @@ -24,7 +24,7 @@ if(NOT ${FFX_FSR2_API_DX12}) endif() set(FFX_SC_DX12_BASE_ARGS - -E CS -Wno-for-redefinition -Wno-ambig-lit-shift -Wno-conversion -DFFX_HLSL=1 -DFFX_HLSL_6_2=1) + -E CS -Wno-for-redefinition -Wno-ambig-lit-shift -DFFX_HLSL=1 -DFFX_HLSL_6_2=1) file(GLOB SHADERS "${CMAKE_CURRENT_SOURCE_DIR}/../shaders/*.h" diff --git a/src/ffx-fsr2-api/dx12/ffx_fsr2_dx12.cpp b/src/ffx-fsr2-api/dx12/ffx_fsr2_dx12.cpp index a5b41e1..4b0f507 100644 --- a/src/ffx-fsr2-api/dx12/ffx_fsr2_dx12.cpp +++ b/src/ffx-fsr2-api/dx12/ffx_fsr2_dx12.cpp @@ -19,6 +19,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
+#include <codecvt> // convert string to wstring #include #include #include @@ -31,8 +32,8 @@ // DX12 prototypes for functions in the backend interface FfxErrorCode GetDeviceCapabilitiesDX12(FfxFsr2Interface* backendInterface, FfxDeviceCapabilities* deviceCapabilities, FfxDevice device); -FfxErrorCode CreateDeviceDX12(FfxFsr2Interface* backendInterface, FfxDevice device); -FfxErrorCode DestroyDeviceDX12(FfxFsr2Interface* backendInterface, FfxDevice device); +FfxErrorCode CreateBackendContextDX12(FfxFsr2Interface* backendInterface, FfxDevice device); +FfxErrorCode DestroyBackendContextDX12(FfxFsr2Interface* backendInterface); FfxErrorCode CreateResourceDX12(FfxFsr2Interface* backendInterface, const FfxCreateResourceDescription* desc, FfxResourceInternal* outTexture); FfxErrorCode RegisterResourceDX12(FfxFsr2Interface* backendInterface, const FfxResource* inResource, FfxResourceInternal* outResourceInternal); FfxErrorCode UnregisterResourcesDX12(FfxFsr2Interface* backendInterface); @@ -40,14 +41,14 @@ FfxResourceDescription GetResourceDescriptorDX12(FfxFsr2Interface* backendInterf FfxErrorCode DestroyResourceDX12(FfxFsr2Interface* backendInterface, FfxResourceInternal resource); FfxErrorCode CreatePipelineDX12(FfxFsr2Interface* backendInterface, FfxFsr2Pass passId, const FfxPipelineDescription* desc, FfxPipelineState* outPass); FfxErrorCode DestroyPipelineDX12(FfxFsr2Interface* backendInterface, FfxPipelineState* pipeline); -FfxErrorCode ScheduleRenderJobDX12(FfxFsr2Interface* backendInterface, const FfxRenderJobDescription* job); -FfxErrorCode ExecuteRenderJobsDX12(FfxFsr2Interface* backendInterface, FfxCommandList commandList); +FfxErrorCode ScheduleGpuJobDX12(FfxFsr2Interface* backendInterface, const FfxGpuJobDescription* job); +FfxErrorCode ExecuteGpuJobsDX12(FfxFsr2Interface* backendInterface, FfxCommandList commandList); #define FSR2_MAX_QUEUED_FRAMES ( 4) #define FSR2_MAX_RESOURCE_COUNT (64) #define FSR2_DESC_RING_SIZE (FSR2_MAX_QUEUED_FRAMES * FFX_FSR2_PASS_COUNT * FSR2_MAX_RESOURCE_COUNT) #define FSR2_MAX_BARRIERS (16) -#define FSR2_MAX_RENDERJOBS (32) +#define FSR2_MAX_GPU_JOBS (32) #define FSR2_MAX_SAMPLERS ( 2) #define UPLOAD_JOB_COUNT (16) @@ -69,8 +70,8 @@ typedef struct BackendContext_DX12 { ID3D12Device* device = nullptr; - FfxRenderJobDescription renderJobs[FSR2_MAX_RENDERJOBS] = {}; - uint32_t renderJobCount; + FfxGpuJobDescription gpuJobs[FSR2_MAX_GPU_JOBS] = {}; + uint32_t gpuJobCount; uint32_t nextStaticResource; uint32_t nextDynamicResource; @@ -113,8 +114,8 @@ FfxErrorCode ffxFsr2GetInterfaceDX12( FFX_ERROR_INSUFFICIENT_MEMORY); outInterface->fpGetDeviceCapabilities = GetDeviceCapabilitiesDX12; - outInterface->fpCreateDevice = CreateDeviceDX12; - outInterface->fpDestroyDevice = DestroyDeviceDX12; + outInterface->fpCreateBackendContext = CreateBackendContextDX12; + outInterface->fpDestroyBackendContext = DestroyBackendContextDX12; outInterface->fpCreateResource = CreateResourceDX12; outInterface->fpRegisterResource = RegisterResourceDX12; outInterface->fpUnregisterResources = UnregisterResourcesDX12; @@ -122,8 +123,8 @@ FfxErrorCode ffxFsr2GetInterfaceDX12( outInterface->fpDestroyResource = DestroyResourceDX12; outInterface->fpCreatePipeline = CreatePipelineDX12; outInterface->fpDestroyPipeline = DestroyPipelineDX12; - outInterface->fpScheduleRenderJob = ScheduleRenderJobDX12; - outInterface->fpExecuteRenderJobs = ExecuteRenderJobsDX12; + outInterface->fpScheduleGpuJob = ScheduleGpuJobDX12; + outInterface->fpExecuteGpuJobs = ExecuteGpuJobsDX12; outInterface->scratchBuffer =
scratchBuffer; outInterface->scratchBufferSize = scratchBufferSize; @@ -268,6 +269,8 @@ DXGI_FORMAT ffxGetDX12FormatFromSurfaceFormat(FfxSurfaceFormat surfaceFormat) return DXGI_FORMAT_R16_SNORM; case(FFX_SURFACE_FORMAT_R8_UNORM): return DXGI_FORMAT_R8_UNORM; + case(FFX_SURFACE_FORMAT_R8G8_UNORM): + return DXGI_FORMAT_R8G8_UNORM; case(FFX_SURFACE_FORMAT_R32_FLOAT): return DXGI_FORMAT_R32_FLOAT; default: @@ -323,7 +326,7 @@ FfxSurfaceFormat ffxGetSurfaceFormatDX12(DXGI_FORMAT format) } // register a DX12 resource to the backend -FfxResource ffxGetResourceDX12(FfxFsr2Context* context, ID3D12Resource* dx12Resource, wchar_t* name, FfxResourceStates state, UINT shaderComponentMapping) +FfxResource ffxGetResourceDX12(FfxFsr2Context* context, ID3D12Resource* dx12Resource, const wchar_t* name, FfxResourceStates state, UINT shaderComponentMapping) { FfxResource resource = {}; resource.resource = reinterpret_cast(dx12Resource); @@ -601,7 +604,7 @@ FfxErrorCode GetDeviceCapabilitiesDX12(FfxFsr2Interface* backendInterface, FfxDe } // initialize the DX12 backend -FfxErrorCode CreateDeviceDX12(FfxFsr2Interface* backendInterface, FfxDevice device) +FfxErrorCode CreateBackendContextDX12(FfxFsr2Interface* backendInterface, FfxDevice device) { HRESULT result = S_OK; ID3D12Device* dx12Device = reinterpret_cast(device); @@ -651,12 +654,9 @@ FfxErrorCode CreateDeviceDX12(FfxFsr2Interface* backendInterface, FfxDevice devi } // deinitialize the DX12 backend -FfxErrorCode DestroyDeviceDX12(FfxFsr2Interface* backendInterface, FfxDevice device) +FfxErrorCode DestroyBackendContextDX12(FfxFsr2Interface* backendInterface) { - ID3D12Device* dx12Device = reinterpret_cast(device); - FFX_ASSERT(NULL != backendInterface); - FFX_ASSERT(NULL != dx12Device); BackendContext_DX12* backendContext = (BackendContext_DX12*)backendInterface->scratchBuffer; backendContext->descHeapSrvCpu->Release(); @@ -675,10 +675,10 @@ FfxErrorCode DestroyDeviceDX12(FfxFsr2Interface* backendInterface, FfxDevice dev backendContext->nextStaticResource = 0; - if (dx12Device != NULL) { + if (backendContext->device != NULL) { - dx12Device->Release(); - dx12Device = NULL; + backendContext->device->Release(); + backendContext->device = NULL; } return FFX_OK; @@ -695,8 +695,8 @@ FfxErrorCode CreateResourceDX12( FFX_ASSERT(NULL != createResourceDescription); FFX_ASSERT(NULL != outTexture); - ID3D12Device* dx12Device = reinterpret_cast(createResourceDescription->device); BackendContext_DX12* backendContext = (BackendContext_DX12*)backendInterface->scratchBuffer; + ID3D12Device* dx12Device = backendContext->device; FFX_ASSERT(NULL != dx12Device); @@ -902,14 +902,14 @@ FfxErrorCode CreateResourceDX12( backendInterface->fpCreateResource(backendInterface, &uploadDescription, ©Src); // setup the upload job - FfxRenderJobDescription copyJob = { + FfxGpuJobDescription copyJob = { - FFX_RENDER_JOB_COPY + FFX_GPU_JOB_COPY }; copyJob.copyJobDescriptor.src = copySrc; copyJob.copyJobDescriptor.dst = *outTexture; - backendInterface->fpScheduleRenderJob(backendInterface, ©Job); + backendInterface->fpScheduleGpuJob(backendInterface, ©Job); } } @@ -988,12 +988,11 @@ FfxErrorCode CreatePipelineDX12( flags |= (pipelineDescription->contextFlags & FFX_FSR2_ENABLE_MOTION_VECTORS_JITTER_CANCELLATION) ? FSR2_SHADER_PERMUTATION_JITTER_MOTION_VECTORS : 0; flags |= (pipelineDescription->contextFlags & FFX_FSR2_ENABLE_DEPTH_INVERTED) ? FSR2_SHADER_PERMUTATION_DEPTH_INVERTED : 0; flags |= (pass == FFX_FSR2_PASS_ACCUMULATE_SHARPEN) ? 
FSR2_SHADER_PERMUTATION_ENABLE_SHARPENING : 0; - flags |= (useLut) ? FSR2_SHADER_PERMUTATION_LANCZOS_LUT : 0; + flags |= (useLut) ? FSR2_SHADER_PERMUTATION_USE_LANCZOS_TYPE : 0; flags |= (canForceWave64) ? FSR2_SHADER_PERMUTATION_FORCE_WAVE64 : 0; - flags |= (supportedFP16) ? FSR2_SHADER_PERMUTATION_ALLOW_FP16 : 0; + flags |= (supportedFP16 && (pass != FFX_FSR2_PASS_RCAS)) ? FSR2_SHADER_PERMUTATION_ALLOW_FP16 : 0; - Fsr2ShaderBlobDX12 shaderBlob = { }; - fsr2GetPermutationBlobByIndex(pass, flags, &shaderBlob); + const Fsr2ShaderBlobDX12 shaderBlob = fsr2GetPermutationBlobByIndex(pass, flags); FFX_ASSERT(shaderBlob.data && shaderBlob.size); // set up root signature @@ -1168,20 +1167,21 @@ FfxErrorCode CreatePipelineDX12( outPipeline->srvCount = shaderBlob.srvCount; outPipeline->uavCount = shaderBlob.uavCount; outPipeline->constCount = shaderBlob.cbvCount; + std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; for (uint32_t srvIndex = 0; srvIndex < outPipeline->srvCount; ++srvIndex) { outPipeline->srvResourceBindings[srvIndex].slotIndex = shaderBlob.boundSRVResources[srvIndex]; - strcpy_s(outPipeline->srvResourceBindings[srvIndex].name, shaderBlob.boundSRVResourceNames[srvIndex]); + wcscpy_s(outPipeline->srvResourceBindings[srvIndex].name, converter.from_bytes(shaderBlob.boundSRVResourceNames[srvIndex]).c_str()); } for (uint32_t uavIndex = 0; uavIndex < outPipeline->uavCount; ++uavIndex) { outPipeline->uavResourceBindings[uavIndex].slotIndex = shaderBlob.boundUAVResources[uavIndex]; - strcpy_s(outPipeline->uavResourceBindings[uavIndex].name, shaderBlob.boundUAVResourceNames[uavIndex]); + wcscpy_s(outPipeline->uavResourceBindings[uavIndex].name, converter.from_bytes(shaderBlob.boundUAVResourceNames[uavIndex]).c_str()); } for (uint32_t cbIndex = 0; cbIndex < outPipeline->constCount; ++cbIndex) { outPipeline->cbResourceBindings[cbIndex].slotIndex = shaderBlob.boundCBVResources[cbIndex]; - strcpy_s(outPipeline->cbResourceBindings[cbIndex].name, shaderBlob.boundCBVResourceNames[cbIndex]); + wcscpy_s(outPipeline->cbResourceBindings[cbIndex].name, converter.from_bytes(shaderBlob.boundCBVResourceNames[cbIndex]).c_str()); } // create the PSO @@ -1200,9 +1200,9 @@ FfxErrorCode CreatePipelineDX12( return FFX_OK; } -FfxErrorCode ScheduleRenderJobDX12( +FfxErrorCode ScheduleGpuJobDX12( FfxFsr2Interface* backendInterface, - const FfxRenderJobDescription* job + const FfxGpuJobDescription* job ) { FFX_ASSERT(NULL != backendInterface); @@ -1210,14 +1210,14 @@ FfxErrorCode ScheduleRenderJobDX12( BackendContext_DX12* backendContext = (BackendContext_DX12*)backendInterface->scratchBuffer; - FFX_ASSERT(backendContext->renderJobCount < FSR2_MAX_RENDERJOBS); + FFX_ASSERT(backendContext->gpuJobCount < FSR2_MAX_GPU_JOBS); - backendContext->renderJobs[backendContext->renderJobCount] = *job; + backendContext->gpuJobs[backendContext->gpuJobCount] = *job; - if (job->jobType == FFX_RENDER_JOB_COMPUTE) { + if (job->jobType == FFX_GPU_JOB_COMPUTE) { // needs to copy SRVs and UAVs in case they are on the stack only - FfxComputeJobDescription* computeJob = &backendContext->renderJobs[backendContext->renderJobCount].computeJobDescriptor; + FfxComputeJobDescription* computeJob = &backendContext->gpuJobs[backendContext->gpuJobCount].computeJobDescriptor; const uint32_t numConstBuffers = job->computeJobDescriptor.pipeline.constCount; for (uint32_t currentRootConstantIndex = 0; currentRootConstantIndex< numConstBuffers; ++currentRootConstantIndex) { @@ -1226,7 +1226,7 @@ FfxErrorCode ScheduleRenderJobDX12( } } - backendContext->renderJobCount++; +
backendContext->gpuJobCount++; return FFX_OK; } @@ -1272,7 +1272,7 @@ void flushBarriers(BackendContext_DX12* backendContext, ID3D12GraphicsCommandLis } } -static FfxErrorCode executeRenderJobCompute(BackendContext_DX12* backendContext, FfxRenderJobDescription* job, ID3D12Device* dx12Device, ID3D12GraphicsCommandList* dx12CommandList) +static FfxErrorCode executeGpuJobCompute(BackendContext_DX12* backendContext, FfxGpuJobDescription* job, ID3D12Device* dx12Device, ID3D12GraphicsCommandList* dx12CommandList) { ID3D12DescriptorHeap* dx12DescriptorHeap = reinterpret_cast(backendContext->descRingBuffer); @@ -1390,7 +1390,7 @@ static FfxErrorCode executeRenderJobCompute(BackendContext_DX12* backendContext, return FFX_OK; } -static FfxErrorCode executeRenderJobCopy(BackendContext_DX12* backendContext, FfxRenderJobDescription* job, ID3D12Device* dx12Device, ID3D12GraphicsCommandList* dx12CommandList) +static FfxErrorCode executeGpuJobCopy(BackendContext_DX12* backendContext, FfxGpuJobDescription* job, ID3D12Device* dx12Device, ID3D12GraphicsCommandList* dx12CommandList) { ID3D12Resource* dx12ResourceSrc = getDX12ResourcePtr(backendContext, job->copyJobDescriptor.src.internalIndex); ID3D12Resource* dx12ResourceDst = getDX12ResourcePtr(backendContext, job->copyJobDescriptor.dst.internalIndex); @@ -1420,7 +1420,7 @@ static FfxErrorCode executeRenderJobCopy(BackendContext_DX12* backendContext, Ff return FFX_OK; } -static FfxErrorCode executeRenderJobClearFloat(BackendContext_DX12* backendContext, FfxRenderJobDescription* job, ID3D12Device* dx12Device, ID3D12GraphicsCommandList* dx12CommandList) +static FfxErrorCode executeGpuJobClearFloat(BackendContext_DX12* backendContext, FfxGpuJobDescription* job, ID3D12Device* dx12Device, ID3D12GraphicsCommandList* dx12CommandList) { uint32_t idx = job->clearJobDescriptor.target.internalIndex; BackendContext_DX12::Resource ffxResource = backendContext->resources[idx]; @@ -1444,7 +1444,7 @@ static FfxErrorCode executeRenderJobClearFloat(BackendContext_DX12* backendConte return FFX_OK; } -FfxErrorCode ExecuteRenderJobsDX12( +FfxErrorCode ExecuteGpuJobsDX12( FfxFsr2Interface* backendInterface, FfxCommandList commandList) { @@ -1454,25 +1454,25 @@ FfxErrorCode ExecuteRenderJobsDX12( FfxErrorCode errorCode = FFX_OK; - // execute all renderjobs - for (uint32_t currentRenderJobIndex = 0; currentRenderJobIndex < backendContext->renderJobCount; ++currentRenderJobIndex) { + // execute all GpuJobs + for (uint32_t currentGpuJobIndex = 0; currentGpuJobIndex < backendContext->gpuJobCount; ++currentGpuJobIndex) { - FfxRenderJobDescription* renderJob = &backendContext->renderJobs[currentRenderJobIndex]; + FfxGpuJobDescription* GpuJob = &backendContext->gpuJobs[currentGpuJobIndex]; ID3D12GraphicsCommandList* dx12CommandList = reinterpret_cast(commandList); ID3D12Device* dx12Device = reinterpret_cast(backendContext->device); - switch (renderJob->jobType) { + switch (GpuJob->jobType) { - case FFX_RENDER_JOB_CLEAR_FLOAT: - errorCode = executeRenderJobClearFloat(backendContext, renderJob, dx12Device, dx12CommandList); + case FFX_GPU_JOB_CLEAR_FLOAT: + errorCode = executeGpuJobClearFloat(backendContext, GpuJob, dx12Device, dx12CommandList); break; - case FFX_RENDER_JOB_COPY: - errorCode = executeRenderJobCopy(backendContext, renderJob, dx12Device, dx12CommandList); + case FFX_GPU_JOB_COPY: + errorCode = executeGpuJobCopy(backendContext, GpuJob, dx12Device, dx12CommandList); break; - case FFX_RENDER_JOB_COMPUTE: - errorCode = executeRenderJobCompute(backendContext, renderJob, 
dx12Device, dx12CommandList); + case FFX_GPU_JOB_COMPUTE: + errorCode = executeGpuJobCompute(backendContext, GpuJob, dx12Device, dx12CommandList); break; default: @@ -1485,7 +1485,7 @@ FfxErrorCode ExecuteRenderJobsDX12( errorCode == FFX_OK, FFX_ERROR_BACKEND_API_ERROR); - backendContext->renderJobCount = 0; + backendContext->gpuJobCount = 0; return FFX_OK; } diff --git a/src/ffx-fsr2-api/dx12/ffx_fsr2_dx12.h b/src/ffx-fsr2-api/dx12/ffx_fsr2_dx12.h index 4129c0a..d3626fc 100644 --- a/src/ffx-fsr2-api/dx12/ffx_fsr2_dx12.h +++ b/src/ffx-fsr2-api/dx12/ffx_fsr2_dx12.h @@ -93,7 +93,7 @@ FFX_API FfxCommandList ffxGetCommandListDX12(ID3D12CommandList* cmdList); FFX_API FfxResource ffxGetResourceDX12( FfxFsr2Context* context, ID3D12Resource* resDx12, - wchar_t* name = nullptr, + const wchar_t* name = nullptr, FfxResourceStates state = FFX_RESOURCE_STATE_COMPUTE_READ, UINT shaderComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING); diff --git a/src/ffx-fsr2-api/dx12/shaders/ffx_fsr2_shaders_dx12.cpp b/src/ffx-fsr2-api/dx12/shaders/ffx_fsr2_shaders_dx12.cpp index 6e74429..c61555b 100644 --- a/src/ffx-fsr2-api/dx12/shaders/ffx_fsr2_shaders_dx12.cpp +++ b/src/ffx-fsr2-api/dx12/shaders/ffx_fsr2_shaders_dx12.cpp @@ -56,14 +56,12 @@ #include "ffx_fsr2_reconstruct_previous_depth_pass_wave64_16bit_permutations.h" #include "ffx_fsr2_rcas_pass_wave64_16bit_permutations.h" -#include // for memset - #if defined(POPULATE_PERMUTATION_KEY) #undef POPULATE_PERMUTATION_KEY #endif // #if defined(POPULATE_PERMUTATION_KEY) #define POPULATE_PERMUTATION_KEY(options, key) \ key.index = 0; \ -key.FFX_FSR2_OPTION_USE_LANCZOS_LUT = FFX_CONTAINS_FLAG(options, FSR2_SHADER_PERMUTATION_LANCZOS_LUT); \ +key.FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE = FFX_CONTAINS_FLAG(options, FSR2_SHADER_PERMUTATION_USE_LANCZOS_TYPE); \ key.FFX_FSR2_OPTION_HDR_COLOR_INPUT = FFX_CONTAINS_FLAG(options, FSR2_SHADER_PERMUTATION_HDR_COLOR_INPUT); \ key.FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS = FFX_CONTAINS_FLAG(options, FSR2_SHADER_PERMUTATION_LOW_RES_MOTION_VECTORS); \ key.FFX_FSR2_OPTION_JITTERED_MOTION_VECTORS = FFX_CONTAINS_FLAG(options, FSR2_SHADER_PERMUTATION_JITTER_MOTION_VECTORS); \ @@ -278,10 +276,7 @@ static Fsr2ShaderBlobDX12 fsr2GetComputeLuminancePyramidPassPermutationBlobByInd } } -static Fsr2ShaderBlobDX12 fsr2GetAutogenReactivePassPermutationBlobByIndex( - uint32_t permutationOptions, - bool isWave64, - bool is16bit) { +static Fsr2ShaderBlobDX12 fsr2GetAutogenReactivePassPermutationBlobByIndex(uint32_t permutationOptions, bool isWave64, bool is16bit) { ffx_fsr2_autogen_reactive_pass_PermutationKey key; @@ -315,10 +310,7 @@ static Fsr2ShaderBlobDX12 fsr2GetAutogenReactivePassPermutationBlobByIndex( } } -FfxErrorCode fsr2GetPermutationBlobByIndex( - FfxFsr2Pass passId, - uint32_t permutationOptions, - Fsr2ShaderBlobDX12* outBlob) { +Fsr2ShaderBlobDX12 fsr2GetPermutationBlobByIndex(FfxFsr2Pass passId, uint32_t permutationOptions) { bool isWave64 = FFX_CONTAINS_FLAG(permutationOptions, FSR2_SHADER_PERMUTATION_FORCE_WAVE64); bool is16bit = FFX_CONTAINS_FLAG(permutationOptions, FSR2_SHADER_PERMUTATION_ALLOW_FP16); @@ -326,68 +318,28 @@ FfxErrorCode fsr2GetPermutationBlobByIndex( switch (passId) { case FFX_FSR2_PASS_PREPARE_INPUT_COLOR: - { - Fsr2ShaderBlobDX12 blob = fsr2GetPrepareInputColorPassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobDX12)); - return FFX_OK; - } - + return fsr2GetPrepareInputColorPassPermutationBlobByIndex(permutationOptions, isWave64, 
is16bit); case FFX_FSR2_PASS_DEPTH_CLIP: - { - Fsr2ShaderBlobDX12 blob = fsr2GetDepthClipPassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobDX12)); - return FFX_OK; - } - + return fsr2GetDepthClipPassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); case FFX_FSR2_PASS_RECONSTRUCT_PREVIOUS_DEPTH: - { - Fsr2ShaderBlobDX12 blob = fsr2GetReconstructPreviousDepthPassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobDX12)); - return FFX_OK; - } - + return fsr2GetReconstructPreviousDepthPassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); case FFX_FSR2_PASS_LOCK: - { - Fsr2ShaderBlobDX12 blob = fsr2GetLockPassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobDX12)); - return FFX_OK; - } - + return fsr2GetLockPassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); case FFX_FSR2_PASS_ACCUMULATE: case FFX_FSR2_PASS_ACCUMULATE_SHARPEN: - { - Fsr2ShaderBlobDX12 blob = fsr2GetAccumulatePassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobDX12)); - return FFX_OK; - } - + return fsr2GetAccumulatePassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); case FFX_FSR2_PASS_RCAS: - { - Fsr2ShaderBlobDX12 blob = fsr2GetRCASPassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobDX12)); - return FFX_OK; - } - + return fsr2GetRCASPassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); case FFX_FSR2_PASS_COMPUTE_LUMINANCE_PYRAMID: - { - Fsr2ShaderBlobDX12 blob = fsr2GetComputeLuminancePyramidPassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobDX12)); - return FFX_OK; - } - + return fsr2GetComputeLuminancePyramidPassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); case FFX_FSR2_PASS_GENERATE_REACTIVE: - { - Fsr2ShaderBlobDX12 blob = fsr2GetAutogenReactivePassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobDX12)); - return FFX_OK; - } - + return fsr2GetAutogenReactivePassPermutationBlobByIndex(permutationOptions, isWave64, is16bit); default: FFX_ASSERT_FAIL("Should never reach here."); break; } // return an empty blob - memset(&outBlob, 0, sizeof(Fsr2ShaderBlobDX12)); - return FFX_OK; + Fsr2ShaderBlobDX12 emptyBlob = {}; + return emptyBlob; } diff --git a/src/ffx-fsr2-api/dx12/shaders/ffx_fsr2_shaders_dx12.h b/src/ffx-fsr2-api/dx12/shaders/ffx_fsr2_shaders_dx12.h index bee0c1b..70a4003 100644 --- a/src/ffx-fsr2-api/dx12/shaders/ffx_fsr2_shaders_dx12.h +++ b/src/ffx-fsr2-api/dx12/shaders/ffx_fsr2_shaders_dx12.h @@ -35,7 +35,7 @@ typedef struct Fsr2ShaderBlobDX12 { const uint32_t size; // Size in bytes. const uint32_t uavCount; // Number of UAV. const uint32_t srvCount; // Number of SRV. - const uint32_t cbvCount; // Number of CBs. + const uint32_t cbvCount; // Number of CBs. const char** boundUAVResourceNames; const uint32_t* boundUAVResources; // Pointer to an array of bound UAV resources. const char** boundSRVResourceNames; @@ -47,7 +47,7 @@ typedef struct Fsr2ShaderBlobDX12 { // The different options which contribute to permutations. 
typedef enum Fs2ShaderPermutationOptionsDX12 { - FSR2_SHADER_PERMUTATION_LANCZOS_LUT = (1<<0), // FFX_FSR2_OPTION_USE_LANCZOS_LUT + FSR2_SHADER_PERMUTATION_USE_LANCZOS_TYPE = (1<<0), // FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE. Off means reference, On means LUT FSR2_SHADER_PERMUTATION_HDR_COLOR_INPUT = (1<<1), // FFX_FSR2_OPTION_HDR_COLOR_INPUT FSR2_SHADER_PERMUTATION_LOW_RES_MOTION_VECTORS = (1<<2), // FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS FSR2_SHADER_PERMUTATION_JITTER_MOTION_VECTORS = (1<<3), // FFX_FSR2_OPTION_JITTERED_MOTION_VECTORS @@ -58,10 +58,7 @@ typedef enum Fs2ShaderPermutationOptionsDX12 { } Fs2ShaderPermutationOptionsDX12; // Get a DX12 shader blob for the specified pass and permutation index. -FfxErrorCode fsr2GetPermutationBlobByIndex( - FfxFsr2Pass passId, - uint32_t permutationOptions, - Fsr2ShaderBlobDX12* outBlob); +Fsr2ShaderBlobDX12 fsr2GetPermutationBlobByIndex(FfxFsr2Pass passId, uint32_t permutationOptions); #if defined(__cplusplus) } diff --git a/src/ffx-fsr2-api/ffx_assert.cpp b/src/ffx-fsr2-api/ffx_assert.cpp index 03680fd..7705490 100644 --- a/src/ffx-fsr2-api/ffx_assert.cpp +++ b/src/ffx-fsr2-api/ffx_assert.cpp @@ -23,7 +23,9 @@ #include <stdlib.h> // for malloc() #ifdef _WIN32 +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif #include <windows.h> // required for OutputDebugString() #include <stdio.h> // required for sprintf_s #endif // #ifndef _WIN32 @@ -47,8 +49,8 @@ bool ffxAssertReport(const char* file, int32_t line, const char* condition, cons #ifdef _WIN32 // form the final assertion string and output to the TTY. - const size_t bufferSize = snprintf(NULL, 0, "%s(%d): ASSERTION FAILED. %s\n", file, line, message ? message : condition) + 1; - char* tempBuf = (char*)malloc(bufferSize); + const size_t bufferSize = static_cast<size_t>(snprintf(nullptr, 0, "%s(%d): ASSERTION FAILED. %s\n", file, line, message ? message : condition)) + 1; + char* tempBuf = static_cast<char*>(malloc(bufferSize)); if (!tempBuf) { return true; diff --git a/src/ffx-fsr2-api/ffx_assert.h b/src/ffx-fsr2-api/ffx_assert.h index b6daee2..f96b157 100644 --- a/src/ffx-fsr2-api/ffx_assert.h +++ b/src/ffx-fsr2-api/ffx_assert.h @@ -65,7 +65,7 @@ typedef void (*FfxAssertCallback)(const char* message); /// @param [in] file The name of the file as a string. /// @param [in] line The index of the line in the file. /// @param [in] condition The boolean condition that was tested. -/// @param [in] message The optional message to print. +/// @param [in] msg The optional message to print. /// /// @returns /// Always returns true. @@ -78,7 +78,7 @@ FFX_API bool ffxAssertReport(const char* file, int32_t line, const char* conditi /// FFX_API void ffxAssertSetPrintingCallback(FfxAssertCallback callback); -#if _DEBUG +#ifdef _DEBUG /// Standard assert macro.
#define FFX_ASSERT(condition) \ do \ diff --git a/src/ffx-fsr2-api/ffx_fsr2.cpp b/src/ffx-fsr2-api/ffx_fsr2.cpp index 39d8f37..ebd69d5 100644 --- a/src/ffx-fsr2-api/ffx_fsr2.cpp +++ b/src/ffx-fsr2-api/ffx_fsr2.cpp @@ -32,6 +32,10 @@ #include "ffx_fsr2_maximum_bias.h" +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wunused-variable" +#endif + // max queued frames for descriptor management static const uint32_t FSR2_MAX_QUEUED_FRAMES = 16; @@ -41,60 +45,60 @@ static const uint32_t FSR2_MAX_QUEUED_FRAMES = 16; typedef struct ResourceBinding { uint32_t index; - char name[64]; + wchar_t name[64]; }ResourceBinding; static const ResourceBinding srvResourceBindingTable[] = { - {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR, "r_input_color_jittered"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_MOTION_VECTORS, "r_motion_vectors"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_DEPTH, "r_depth" }, - {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE, "r_exposure"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK, "r_reactive_mask"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK, "r_transparency_and_composition_mask"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH, "r_ReconstructedPrevNearestDepth"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS, "r_dilated_motion_vectors"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH, "r_dilatedDepth"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR, "r_internal_upscaled_color"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS, "r_lock_status"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_DEPTH_CLIP, "r_depth_clip"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR, "r_prepared_input_color"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY, "r_luma_history" }, - {FFX_FSR2_RESOURCE_IDENTIFIER_RCAS_INPUT, "r_rcas_input"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_LANCZOS_LUT, "r_lanczos_lut"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE, "r_imgMips"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_SHADING_CHANGE, "r_img_mip_shading_change"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_5, "r_img_mip_5"}, - {FFX_FSR2_RESOURCE_IDENTITIER_UPSAMPLE_MAXIMUM_BIAS_LUT, "r_upsample_maximum_bias_lut"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_REACTIVE_MAX, "r_reactive_max"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR, L"r_input_color_jittered"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_MOTION_VECTORS, L"r_motion_vectors"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_DEPTH, L"r_depth" }, + {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE, L"r_exposure"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK, L"r_reactive_mask"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK, L"r_transparency_and_composition_mask"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH, L"r_reconstructed_previous_nearest_depth"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS, L"r_dilated_motion_vectors"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH, L"r_dilatedDepth"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR, L"r_internal_upscaled_color"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS, L"r_lock_status"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_DEPTH_CLIP, L"r_depth_clip"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR, L"r_prepared_input_color"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY, L"r_luma_history" }, + {FFX_FSR2_RESOURCE_IDENTIFIER_RCAS_INPUT, L"r_rcas_input"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_LANCZOS_LUT, L"r_lanczos_lut"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE, L"r_imgMips"}, + 
{FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_SHADING_CHANGE, L"r_img_mip_shading_change"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_5, L"r_img_mip_5"}, + {FFX_FSR2_RESOURCE_IDENTITIER_UPSAMPLE_MAXIMUM_BIAS_LUT, L"r_upsample_maximum_bias_lut"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS, L"r_dilated_reactive_masks"}, }; static const ResourceBinding uavResourceBindingTable[] = { - {FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH, "rw_ReconstructedPrevNearestDepth"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS, "rw_dilated_motion_vectors"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH, "rw_dilatedDepth"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR, "rw_internal_upscaled_color"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS, "rw_lock_status"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_DEPTH_CLIP, "rw_depth_clip"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR, "rw_prepared_input_color"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY, "rw_luma_history"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_UPSCALED_OUTPUT, "rw_upscaled_output"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_SHADING_CHANGE, "rw_img_mip_shading_change"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_5, "rw_img_mip_5"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_REACTIVE_MAX, "rw_reactive_max"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_EXPOSURE, "rw_exposure"}, - {FFX_FSR2_RESOURCE_IDENTIFIER_SPD_ATOMIC_COUNT, "rw_spd_global_atomic"}, -#if defined(FFX_INTERNAL) - {FFX_FSR2_RESOURCE_IDENTIFIER_DEBUG_OUTPUT, "rw_debug_out"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH, L"rw_reconstructed_previous_nearest_depth"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS, L"rw_dilated_motion_vectors"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH, L"rw_dilatedDepth"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR, L"rw_internal_upscaled_color"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS, L"rw_lock_status"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_DEPTH_CLIP, L"rw_depth_clip"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR, L"rw_prepared_input_color"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY, L"rw_luma_history"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_UPSCALED_OUTPUT, L"rw_upscaled_output"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_SHADING_CHANGE, L"rw_img_mip_shading_change"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_5, L"rw_img_mip_5"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS, L"rw_dilated_reactive_masks"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_EXPOSURE, L"rw_exposure"}, + {FFX_FSR2_RESOURCE_IDENTIFIER_SPD_ATOMIC_COUNT, L"rw_spd_global_atomic"}, +#if defined(FFX_INTERNAL) + {FFX_FSR2_RESOURCE_IDENTIFIER_DEBUG_OUTPUT, L"rw_debug_out"}, #endif }; static const ResourceBinding cbResourceBindingTable[] = { - {FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_FSR2, "cbFSR2"}, - {FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_SPD, "cbSPD"}, - {FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_RCAS, "cbRCAS"}, + {FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_FSR2, L"cbFSR2"}, + {FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_SPD, L"cbSPD"}, + {FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_RCAS, L"cbRCAS"}, }; // Broad structure of the root signature. 
@@ -124,8 +128,8 @@ typedef struct Fsr2GenerateReactiveConstants { float scale; float threshold; + float binaryValue; uint32_t flags; - float _Padding; } Fsr2GenerateReactiveConstants; typedef union Fsr2SecondaryUnion { @@ -182,7 +186,7 @@ static FfxErrorCode patchResourceBindings(FfxPipelineState* inoutPipeline) int32_t mapIndex = 0; for (mapIndex = 0; mapIndex < _countof(srvResourceBindingTable); ++mapIndex) { - if (0 == strcmp(srvResourceBindingTable[mapIndex].name, inoutPipeline->srvResourceBindings[srvIndex].name)) + if (0 == wcscmp(srvResourceBindingTable[mapIndex].name, inoutPipeline->srvResourceBindings[srvIndex].name)) break; } if (mapIndex == _countof(srvResourceBindingTable)) @@ -196,7 +200,7 @@ static FfxErrorCode patchResourceBindings(FfxPipelineState* inoutPipeline) int32_t mapIndex = 0; for (mapIndex = 0; mapIndex < _countof(uavResourceBindingTable); ++mapIndex) { - if (0 == strcmp(uavResourceBindingTable[mapIndex].name, inoutPipeline->uavResourceBindings[uavIndex].name)) + if (0 == wcscmp(uavResourceBindingTable[mapIndex].name, inoutPipeline->uavResourceBindings[uavIndex].name)) break; } if (mapIndex == _countof(uavResourceBindingTable)) @@ -210,7 +214,7 @@ static FfxErrorCode patchResourceBindings(FfxPipelineState* inoutPipeline) int32_t mapIndex = 0; for (mapIndex = 0; mapIndex < _countof(cbResourceBindingTable); ++mapIndex) { - if (0 == strcmp(cbResourceBindingTable[mapIndex].name, inoutPipeline->cbResourceBindings[cbIndex].name)) + if (0 == wcscmp(cbResourceBindingTable[mapIndex].name, inoutPipeline->cbResourceBindings[cbIndex].name)) break; } if (mapIndex == _countof(cbResourceBindingTable)) @@ -284,7 +288,7 @@ static FfxErrorCode fsr2Create(FfxFsr2Context_Private* context, const FfxFsr2Con memcpy(&context->contextDescription, contextDescription, sizeof(FfxFsr2ContextDescription)); // Create the device. - FfxErrorCode errorCode = context->contextDescription.callbacks.fpCreateDevice(&context->contextDescription.callbacks, context->device); + FfxErrorCode errorCode = context->contextDescription.callbacks.fpCreateBackendContext(&context->contextDescription.callbacks, context->device); FFX_RETURN_ON_ERROR(errorCode == FFX_OK, errorCode); // call out for device caps. @@ -362,8 +366,8 @@ static FfxErrorCode fsr2Create(FfxFsr2Context_Private* context, const FfxFsr2Con { FFX_FSR2_RESOURCE_IDENTIFIER_SPD_ATOMIC_COUNT, L"FSR2_SpdAtomicCounter", FFX_RESOURCE_USAGE_UAV, FFX_SURFACE_FORMAT_R32_UINT, 1, 1, 1, FFX_RESOURCE_FLAGS_ALIASABLE, sizeof(atomicInitData), &atomicInitData }, - { FFX_FSR2_RESOURCE_IDENTIFIER_REACTIVE_MAX, L"FSR2_ReactiveMaskMax", FFX_RESOURCE_USAGE_UAV, - FFX_SURFACE_FORMAT_R8_UNORM, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_ALIASABLE }, + { FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS, L"FSR2_DilatedReactiveMasks", FFX_RESOURCE_USAGE_UAV, + FFX_SURFACE_FORMAT_R8G8_UNORM, contextDescription->maxRenderSize.width, contextDescription->maxRenderSize.height, 1, FFX_RESOURCE_FLAGS_ALIASABLE }, { FFX_FSR2_RESOURCE_IDENTIFIER_LANCZOS_LUT, L"FSR2_LanczosLutData", FFX_RESOURCE_USAGE_READ_ONLY, FFX_SURFACE_FORMAT_R16_SNORM, lanczos2LutWidth, 1, 1, FFX_RESOURCE_FLAGS_NONE, sizeof(lanczos2Weights), lanczos2Weights }, @@ -395,7 +399,7 @@ static FfxErrorCode fsr2Create(FfxFsr2Context_Private* context, const FfxFsr2Con const FfxResourceType resourceType = currentSurfaceDescription->height > 1 ? 
FFX_RESOURCE_TYPE_TEXTURE2D : texture1dResourceType; const FfxResourceDescription resourceDescription = { resourceType, currentSurfaceDescription->format, currentSurfaceDescription->width, currentSurfaceDescription->height, 1, currentSurfaceDescription->mipCount }; const FfxResourceStates initialState = (currentSurfaceDescription->usage == FFX_RESOURCE_USAGE_READ_ONLY) ? FFX_RESOURCE_STATE_COMPUTE_READ : FFX_RESOURCE_STATE_UNORDERED_ACCESS; - const FfxCreateResourceDescription createResourceDescription = { context->device, FFX_HEAP_TYPE_DEFAULT, resourceDescription, initialState, currentSurfaceDescription->initDataSize, currentSurfaceDescription->initData, currentSurfaceDescription->name, currentSurfaceDescription->usage, currentSurfaceDescription->id }; + const FfxCreateResourceDescription createResourceDescription = { FFX_HEAP_TYPE_DEFAULT, resourceDescription, initialState, currentSurfaceDescription->initDataSize, currentSurfaceDescription->initData, currentSurfaceDescription->name, currentSurfaceDescription->usage, currentSurfaceDescription->id }; FFX_VALIDATE(context->contextDescription.callbacks.fpCreateResource(&context->contextDescription.callbacks, &createResourceDescription, &context->srvResources[currentSurfaceDescription->id])); } @@ -430,7 +434,7 @@ static void fsr2SafeReleaseDevice(FfxFsr2Context_Private* context, FfxDevice* de return; } - context->contextDescription.callbacks.fpDestroyDevice(&context->contextDescription.callbacks, *device); + context->contextDescription.callbacks.fpDestroyBackendContext(&context->contextDescription.callbacks); *device = nullptr; } @@ -480,13 +484,13 @@ static void scheduleDispatch(FfxFsr2Context_Private* context, const FfxFsr2Dispa const uint32_t currentResourceId = pipeline->srvResourceBindings[currentShaderResourceViewIndex].resourceIdentifier; const FfxResourceInternal currentResource = context->srvResources[currentResourceId]; jobDescriptor.srvs[currentShaderResourceViewIndex] = currentResource; - strcpy_s(jobDescriptor.srvNames[currentShaderResourceViewIndex], pipeline->srvResourceBindings[currentShaderResourceViewIndex].name); + wcscpy_s(jobDescriptor.srvNames[currentShaderResourceViewIndex], pipeline->srvResourceBindings[currentShaderResourceViewIndex].name); } for (uint32_t currentUnorderedAccessViewIndex = 0; currentUnorderedAccessViewIndex < pipeline->uavCount; ++currentUnorderedAccessViewIndex) { const uint32_t currentResourceId = pipeline->uavResourceBindings[currentUnorderedAccessViewIndex].resourceIdentifier; - strcpy_s(jobDescriptor.uavNames[currentUnorderedAccessViewIndex], pipeline->uavResourceBindings[currentUnorderedAccessViewIndex].name); + wcscpy_s(jobDescriptor.uavNames[currentUnorderedAccessViewIndex], pipeline->uavResourceBindings[currentUnorderedAccessViewIndex].name); if (currentResourceId >= FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_0 && currentResourceId <= FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_12) { @@ -508,14 +512,14 @@ static void scheduleDispatch(FfxFsr2Context_Private* context, const FfxFsr2Dispa jobDescriptor.pipeline = *pipeline; for (uint32_t currentRootConstantIndex = 0; currentRootConstantIndex < pipeline->constCount; ++currentRootConstantIndex) { - strcpy_s( jobDescriptor.cbNames[currentRootConstantIndex], pipeline->cbResourceBindings[currentRootConstantIndex].name); + wcscpy_s( jobDescriptor.cbNames[currentRootConstantIndex], pipeline->cbResourceBindings[currentRootConstantIndex].name); jobDescriptor.cbs[currentRootConstantIndex] = 
globalFsr2ConstantBuffers[pipeline->cbResourceBindings[currentRootConstantIndex].resourceIdentifier]; } - FfxRenderJobDescription dispatchJob = { FFX_RENDER_JOB_COMPUTE }; + FfxGpuJobDescription dispatchJob = { FFX_GPU_JOB_COMPUTE }; dispatchJob.computeJobDescriptor = jobDescriptor; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &dispatchJob); + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &dispatchJob); } static FfxErrorCode fsr2Dispatch(FfxFsr2Context_Private* context, const FfxFsr2DispatchDescription* params) @@ -532,26 +536,28 @@ static FfxErrorCode fsr2Dispatch(FfxFsr2Context_Private* context, const FfxFsr2D FFX_RETURN_ON_ERROR(errorCode == FFX_OK, errorCode); } + static const float lockInitialLifetime = 1.0f; + if (context->firstExecution) { const float clearValuesToZeroFloat[]{ 0.f, 0.f, 0.f, 0.f }; - FfxRenderJobDescription clearJob = { FFX_RENDER_JOB_CLEAR_FLOAT }; + FfxGpuJobDescription clearJob = { FFX_GPU_JOB_CLEAR_FLOAT }; memcpy(clearJob.clearJobDescriptor.color, clearValuesToZeroFloat, 4 * sizeof(float)); clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_1]; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &clearJob); + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS_2]; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &clearJob); + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS]; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &clearJob); + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR]; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &clearJob); + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY]; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &clearJob); + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_DEPTH_CLIP]; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &clearJob); - clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_REACTIVE_MAX]; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &clearJob); + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); + clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS]; + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); } // Prepare 
per frame descriptor tables @@ -666,7 +672,7 @@ static FfxErrorCode fsr2Dispatch(FfxFsr2Context_Private* context, const FfxFsr2D // lock data, assuming jitter sequence length computation for now const int32_t jitterPhaseCount = ffxFsr2GetJitterPhaseCount(params->renderSize.width, context->contextDescription.displaySize.width); - context->constants.lockInitialLifetime = 1.0f; + context->constants.lockInitialLifetime = lockInitialLifetime; // init on first frame if (resetAccumulation || context->constants.jitterPhaseCount == 0) { @@ -681,7 +687,7 @@ static FfxErrorCode fsr2Dispatch(FfxFsr2Context_Private* context, const FfxFsr2D } const int32_t maxLockFrames = (int32_t)(context->constants.jitterPhaseCount) + 1; - context->constants.lockTickDelta = context->constants.lockInitialLifetime / maxLockFrames; + context->constants.lockTickDelta = lockInitialLifetime / maxLockFrames; // convert delta time to seconds and clamp to [0, 1]. context->constants.deltaTime = FFX_MAXIMUM(0.0f, FFX_MINIMUM(1.0f, params->frameTimeDelta / 1000.0f)); @@ -711,32 +717,32 @@ static FfxErrorCode fsr2Dispatch(FfxFsr2Context_Private* context, const FfxFsr2D // Clear reconstructed depth for max depth store. if (resetAccumulation) { - FfxRenderJobDescription clearJob = { FFX_RENDER_JOB_CLEAR_FLOAT }; + FfxGpuJobDescription clearJob = { FFX_GPU_JOB_CLEAR_FLOAT }; // LockStatus resource has no sign bit, callback functions are compensating for this. // Clearing the resource must follow the same logic. float clearValuesLockStatus[4]{}; - clearValuesLockStatus[LOCK_LIFETIME_REMAINING] = context->constants.lockInitialLifetime * 2.0f; + clearValuesLockStatus[LOCK_LIFETIME_REMAINING] = lockInitialLifetime * 2.0f; clearValuesLockStatus[LOCK_TEMPORAL_LUMA] = 0.0f; clearValuesLockStatus[LOCK_TRUST] = 1.0f; memcpy(clearJob.clearJobDescriptor.color, clearValuesLockStatus, 4 * sizeof(float)); clearJob.clearJobDescriptor.target = context->srvResources[lockStatusSrvResourceIndex]; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &clearJob); + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); const float clearValuesToZeroFloat[]{ 0.f, 0.f, 0.f, 0.f }; memcpy(clearJob.clearJobDescriptor.color, clearValuesToZeroFloat, 4 * sizeof(float)); clearJob.clearJobDescriptor.target = context->srvResources[upscaledColorSrvResourceIndex]; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &clearJob); + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE]; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &clearJob); + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); if (context->contextDescription.flags & FFX_FSR2_ENABLE_AUTO_EXPOSURE) { const float clearValuesExposure[]{ -1.f, 1e8f, 0.f, 0.f }; memcpy(clearJob.clearJobDescriptor.color, clearValuesExposure, 4 * sizeof(float)); clearJob.clearJobDescriptor.target = context->srvResources[FFX_FSR2_RESOURCE_IDENTIFIER_EXPOSURE]; - context->contextDescription.callbacks.fpScheduleRenderJob(&context->contextDescription.callbacks, &clearJob); + context->contextDescription.callbacks.fpScheduleGpuJob(&context->contextDescription.callbacks, &clearJob); } } @@ -791,7 +797,7 @@ static FfxErrorCode 
fsr2Dispatch(FfxFsr2Context_Private* context, const FfxFsr2D // Fsr2MaxQueuedFrames must be an even number. FFX_STATIC_ASSERT((FSR2_MAX_QUEUED_FRAMES & 1) == 0); - context->contextDescription.callbacks.fpExecuteRenderJobs(&context->contextDescription.callbacks, commandList); + context->contextDescription.callbacks.fpExecuteGpuJobs(&context->contextDescription.callbacks, commandList); // release dynamic resources context->contextDescription.callbacks.fpUnregisterResources(&context->contextDescription.callbacks); @@ -814,8 +820,8 @@ FfxErrorCode ffxFsr2ContextCreate(FfxFsr2Context* context, const FfxFsr2ContextD // validate that all callbacks are set for the interface FFX_RETURN_ON_ERROR(contextDescription->callbacks.fpGetDeviceCapabilities, FFX_ERROR_INCOMPLETE_INTERFACE); - FFX_RETURN_ON_ERROR(contextDescription->callbacks.fpCreateDevice, FFX_ERROR_INCOMPLETE_INTERFACE); - FFX_RETURN_ON_ERROR(contextDescription->callbacks.fpDestroyDevice, FFX_ERROR_INCOMPLETE_INTERFACE); + FFX_RETURN_ON_ERROR(contextDescription->callbacks.fpCreateBackendContext, FFX_ERROR_INCOMPLETE_INTERFACE); + FFX_RETURN_ON_ERROR(contextDescription->callbacks.fpDestroyBackendContext, FFX_ERROR_INCOMPLETE_INTERFACE); // if a scratch buffer is declared, then we must have a size if (contextDescription->callbacks.scratchBuffer) { @@ -875,6 +881,7 @@ FfxErrorCode ffxFsr2ContextDispatch(FfxFsr2Context* context, const FfxFsr2Dispat float ffxFsr2GetUpscaleRatioFromQualityMode(FfxFsr2QualityMode qualityMode) { switch (qualityMode) { + case FFX_FSR2_QUALITY_MODE_QUALITY: return 1.5f; case FFX_FSR2_QUALITY_MODE_BALANCED: @@ -915,6 +922,17 @@ FfxErrorCode ffxFsr2GetRenderResolutionFromQualityMode( return FFX_OK; } +FfxErrorCode ffxFsr2ContextEnqueueRefreshPipelineRequest(FfxFsr2Context* context) +{ + FFX_RETURN_ON_ERROR( + context, + FFX_ERROR_INVALID_POINTER); + + FfxFsr2Context_Private* contextPrivate = (FfxFsr2Context_Private*)context; + contextPrivate->refreshPipelineStates = true; + + return FFX_OK; +} int32_t ffxFsr2GetJitterPhaseCount(int32_t renderWidth, int32_t displayWidth) { @@ -985,9 +1003,9 @@ FfxErrorCode ffxFsr2ContextGenerateReactiveMask(FfxFsr2Context* context, const F contextPrivate->contextDescription.callbacks.fpRegisterResource(&contextPrivate->contextDescription.callbacks, ¶ms->colorOpaqueOnly, &jobDescriptor.srvs[0]); contextPrivate->contextDescription.callbacks.fpRegisterResource(&contextPrivate->contextDescription.callbacks, ¶ms->colorPreUpscale, &jobDescriptor.srvs[1]); contextPrivate->contextDescription.callbacks.fpRegisterResource(&contextPrivate->contextDescription.callbacks, ¶ms->outReactive, &jobDescriptor.uavs[0]); - strcpy_s(jobDescriptor.srvNames[0], pipeline->srvResourceBindings[0].name); - strcpy_s(jobDescriptor.srvNames[1], pipeline->srvResourceBindings[1].name); - strcpy_s(jobDescriptor.uavNames[0], pipeline->uavResourceBindings[0].name); + wcscpy_s(jobDescriptor.srvNames[0], pipeline->srvResourceBindings[0].name); + wcscpy_s(jobDescriptor.srvNames[1], pipeline->srvResourceBindings[1].name); + wcscpy_s(jobDescriptor.uavNames[0], pipeline->uavResourceBindings[0].name); jobDescriptor.dimensions[0] = dispatchSrcX; jobDescriptor.dimensions[1] = dispatchSrcY; @@ -997,18 +1015,19 @@ FfxErrorCode ffxFsr2ContextGenerateReactiveMask(FfxFsr2Context* context, const F Fsr2GenerateReactiveConstants constants = {}; constants.scale = params->scale; constants.threshold = params->cutoffThreshold; + constants.binaryValue = params->binaryValue; constants.flags = params->flags; jobDescriptor.cbs[0].uint32Size = 
sizeof(constants); memcpy(&jobDescriptor.cbs[0].data, &constants, sizeof(constants)); - strcpy_s(jobDescriptor.cbNames[0], pipeline->cbResourceBindings[0].name); + wcscpy_s(jobDescriptor.cbNames[0], pipeline->cbResourceBindings[0].name); - FfxRenderJobDescription dispatchJob = { FFX_RENDER_JOB_COMPUTE }; + FfxGpuJobDescription dispatchJob = { FFX_GPU_JOB_COMPUTE }; dispatchJob.computeJobDescriptor = jobDescriptor; - contextPrivate->contextDescription.callbacks.fpScheduleRenderJob(&contextPrivate->contextDescription.callbacks, &dispatchJob); + contextPrivate->contextDescription.callbacks.fpScheduleGpuJob(&contextPrivate->contextDescription.callbacks, &dispatchJob); - contextPrivate->contextDescription.callbacks.fpExecuteRenderJobs(&contextPrivate->contextDescription.callbacks, commandList); + contextPrivate->contextDescription.callbacks.fpExecuteGpuJobs(&contextPrivate->contextDescription.callbacks, commandList); return FFX_OK; } diff --git a/src/ffx-fsr2-api/ffx_fsr2.h b/src/ffx-fsr2-api/ffx_fsr2.h index cf0e2ee..ff96d71 100644 --- a/src/ffx-fsr2-api/ffx_fsr2.h +++ b/src/ffx-fsr2-api/ffx_fsr2.h @@ -19,6 +19,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. + // @defgroup FSR2 #pragma once @@ -34,12 +35,12 @@ /// FidelityFX Super Resolution 2 minor version. /// /// @ingroup FSR2 -#define FFX_FSR2_VERSION_MINOR (0) +#define FFX_FSR2_VERSION_MINOR (1) /// FidelityFX Super Resolution 2 patch version. /// /// @ingroup FSR2 -#define FFX_FSR2_VERSION_PATCH (1) +#define FFX_FSR2_VERSION_PATCH (0) /// The size of the context specified in 32bit values. /// @@ -146,6 +147,7 @@ typedef struct FfxFsr2GenerateReactiveDescription { FfxDimensions2D renderSize; ///< The resolution that was used for rendering the input resources. float scale; ///< A value to scale the output float cutoffThreshold; ///< A threshold value to generate a binary reactive mask + float binaryValue; ///< A value to set for the binary reactive mask uint32_t flags; ///< Flags to determine how to generate the reactive mask } FfxFsr2GenerateReactiveDescription; diff --git a/src/ffx-fsr2-api/ffx_fsr2_interface.h b/src/ffx-fsr2-api/ffx_fsr2_interface.h index 488c7b4..db13fd0 100644 --- a/src/ffx-fsr2-api/ffx_fsr2_interface.h +++ b/src/ffx-fsr2-api/ffx_fsr2_interface.h @@ -40,7 +40,7 @@ FFX_FORWARD_DECLARE(FfxFsr2Interface); /// /// FSR2 is implemented as a composite of several compute passes each /// computing a key part of the final result. Each call to the -/// FfxFsr2ScheduleRenderJobFunc callback function will +/// FfxFsr2ScheduleGpuJobFunc callback function will /// correspond to a single pass included in FfxFsr2Pass. For a /// more comprehensive description of each pass, please refer to the FSR2 /// reference documentation. @@ -68,50 +68,13 @@ typedef enum FfxFsr2Pass { FFX_FSR2_PASS_COUNT ///< The number of passes performed by FSR2. } FfxFsr2Pass; -/// A structure containing the description used to create a -/// FfxPipeline structure. +/// Create and initialize the backend context. /// -/// A pipeline is the name given to a shader and the collection of state that -/// is required to dispatch it. In the context of FSR2 and its architecture -/// this means that a FfxPipelineDescription will map to either a -/// monolithic object in an explicit API (such as a -/// PipelineStateObject in DirectX 12). Or a shader and some -/// ancillary API objects (in something like DirectX 11). 
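The binaryValue field added above lets an application choose the value written into the auto-generated reactive mask wherever the opaque-only versus full-scene difference exceeds cutoffThreshold. A minimal sketch of filling the extended description follows; the resource handles, the commandList field (not visible in this hunk), and the numeric constants are illustrative application-side assumptions, not values mandated by the patch.

    // Illustrative sketch only: handles and constants are placeholders.
    FfxFsr2GenerateReactiveDescription genReactive = {};
    genReactive.commandList     = myCommandList;        // assumed field; the pass is recorded into a command list
    genReactive.colorOpaqueOnly = sceneColorOpaqueOnly; // scene colour before transparent geometry
    genReactive.colorPreUpscale = sceneColorPreUpscale; // full scene colour at render resolution
    genReactive.outReactive     = reactiveMaskUav;      // UAV receiving the generated mask
    genReactive.renderSize      = { renderWidth, renderHeight };
    genReactive.scale           = 1.0f;                 // scales the per-pixel difference
    genReactive.cutoffThreshold = 0.2f;                 // differences above this count as reactive
    genReactive.binaryValue     = 0.9f;                 // new in 2.1: value written where the cutoff is exceeded
    genReactive.flags           = 0;

    FfxErrorCode err = ffxFsr2ContextGenerateReactiveMask(&fsr2Context, &genReactive);
    // err == FFX_OK on success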
-/// -/// The contextFlags field contains a copy of the flags passed -/// to ffxFsr2ContextCreate via the flags field of -/// the FfxFsr2InitializationParams structure. These flags are -/// used to determine which permutation of a pipeline for a specific -/// FfxFsr2Pass should be used to implement the features required -/// by each application, as well as to acheive the best performance on specific -/// target hardware configurations. -/// -/// When using one of the provided backends for FSR2 (such as DirectX 12 or -/// Vulkan) the data required to create a pipeline is compiled offline and -/// included into the backend library that you are using. For cases where the -/// backend interface is overriden by providing custom callback function -/// implementations care should be taken to respect the contents of the -/// contextFlags field in order to correctly support the options -/// provided by FSR2, and acheive best performance. -/// -/// @ingroup FSR2 -typedef struct FfxPipelineDescription { - - uint32_t contextFlags; ///< A collection of FfxFsr2InitializationFlagBits which were passed to the context. - FfxFilterType* samplers; ///< Array of static samplers. - size_t samplerCount; ///< The number of samples contained inside samplers. - const uint32_t* rootConstantBufferSizes; ///< Array containing the sizes of the root constant buffers (count of 32 bit elements). - uint32_t rootConstantBufferCount; ///< The number of root constants contained within rootConstantBufferSizes. -} FfxPipelineDescription; - -/// Create (or reference) a device. -/// -/// The callback function should either create a new device or (more likely) it -/// should return an already existing device and add a reference to it (for -/// those APIs which implement reference counting, such as DirectX 11 and 12). +/// The callback function sets up the backend context for rendering. +/// It will create or reference the device and create required internal data structures. /// /// @param [in] backendInterface A pointer to the backend interface. -/// @param [out] outDevice The device that is either created (or referenced). +/// @param [in] device The FfxDevice obtained by ffxGetDevice(DX12/VK/...). /// /// @retval /// FFX_OK The operation completed successfully. @@ -119,9 +82,9 @@ typedef struct FfxPipelineDescription { /// Anything else The operation failed. /// /// @ingroup FSR2 -typedef FfxErrorCode (*FfxFsr2CreateDeviceFunc)( +typedef FfxErrorCode (*FfxFsr2CreateBackendContextFunc)( FfxFsr2Interface* backendInterface, - FfxDevice outDevice); + FfxDevice device); /// Get a list of capabilities of the device. /// @@ -153,12 +116,11 @@ typedef FfxErrorCode(*FfxFsr2GetDeviceCapabilitiesFunc)( FfxDeviceCapabilities* outDeviceCapabilities, FfxDevice device); -/// Destroy (or dereference) a device. +/// Destroy the backend context and dereference the device. /// /// This function is called when the FfxFsr2Context is destroyed. /// /// @param [in] backendInterface A pointer to the backend interface. -/// @param [in] device The FfxDevice object to be destroyed (or deferenced). /// /// @retval /// FFX_OK The operation completed successfully. @@ -166,9 +128,8 @@ typedef FfxErrorCode(*FfxFsr2GetDeviceCapabilitiesFunc)( /// Anything else The operation failed. /// /// @ingroup FSR2 -typedef FfxErrorCode(*FfxFsr2DestroyDeviceFunc)( - FfxFsr2Interface* backendInterface, - FfxDevice device); +typedef FfxErrorCode(*FfxFsr2DestroyBackendContextFunc)( + FfxFsr2Interface* backendInterface); /// Create a resource. 
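With the interface reworked as above, a backend exposes fpCreateBackendContext and fpDestroyBackendContext instead of device create/destroy callbacks, and ffxFsr2ContextCreate rejects an interface missing either one. A minimal DX12-flavoured sketch of the calling side is shown below; the variable names (d3d12Device, renderWidth, displayWidth, and so on) and the exact FfxFsr2ContextDescription field names are assumptions taken from the public headers rather than from this hunk.

    // Sketch: create the backend interface and the FSR2 context (DX12 backend assumed).
    // Assumes ffx_fsr2.h and dx12/ffx_fsr2_dx12.h are included.
    const size_t scratchSize = ffxFsr2GetScratchMemorySizeDX12();
    void* scratch = malloc(scratchSize);

    FfxFsr2ContextDescription contextDesc = {};
    FfxErrorCode err = ffxFsr2GetInterfaceDX12(&contextDesc.callbacks, d3d12Device, scratch, scratchSize);
    // The DX12 backend fills fpCreateBackendContext / fpDestroyBackendContext here.

    contextDesc.device        = ffxGetDeviceDX12(d3d12Device);   // FfxDevice handed on to fpCreateBackendContext
    contextDesc.maxRenderSize = { renderWidth,  renderHeight  };
    contextDesc.displaySize   = { displayWidth, displayHeight };
    contextDesc.flags         = 0;

    FfxFsr2Context fsr2Context;
    err = ffxFsr2ContextCreate(&fsr2Context, &contextDesc);
    // Returns FFX_ERROR_INCOMPLETE_INTERFACE if either backend-context callback is missing.

    // ... per-frame dispatch calls ...

    ffxFsr2ContextDestroy(&fsr2Context);   // ends in fpDestroyBackendContext; the GPU must be idle
    free(scratch);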
/// @@ -308,13 +269,13 @@ typedef FfxErrorCode (*FfxFsr2DestroyPipelineFunc)( FfxPipelineState* pipeline); /// Schedule a render job to be executed on the next call of -/// FfxFsr2ExecuteRenderJobsFunc. +/// FfxFsr2ExecuteGpuJobsFunc. /// /// Render jobs can perform one of three different tasks: clear, copy or /// compute dispatches. /// /// @param [in] backendInterface A pointer to the backend interface. -/// @param [in] job A pointer to a FfxRenderJobDescription structure. +/// @param [in] job A pointer to a FfxGpuJobDescription structure. /// /// @retval /// FFX_OK The operation completed successfully. @@ -322,15 +283,15 @@ typedef FfxErrorCode (*FfxFsr2DestroyPipelineFunc)( /// Anything else The operation failed. /// /// @ingroup FSR2 -typedef FfxErrorCode (*FfxFsr2ScheduleRenderJobFunc)( +typedef FfxErrorCode (*FfxFsr2ScheduleGpuJobFunc)( FfxFsr2Interface* backendInterface, - const FfxRenderJobDescription* job); + const FfxGpuJobDescription* job); /// Execute scheduled render jobs on the comandList provided. /// /// The recording of the graphics API commands should take place in this /// callback function, the render jobs which were previously enqueued (via -/// callbacks made to FfxFsr2ScheduleRenderJobFunc) should be +/// callbacks made to FfxFsr2ScheduleGpuJobFunc) should be /// processed in the order they were received. Advanced users might choose to /// reorder the rendering jobs, but should do so with care to respect the /// resource dependencies. @@ -348,7 +309,7 @@ typedef FfxErrorCode (*FfxFsr2ScheduleRenderJobFunc)( /// Anything else The operation failed. /// /// @ingroup FSR2 -typedef FfxErrorCode (*FfxFsr2ExecuteRenderJobsFunc)( +typedef FfxErrorCode (*FfxFsr2ExecuteGpuJobsFunc)( FfxFsr2Interface* backendInterface, FfxCommandList commandList); @@ -370,8 +331,8 @@ typedef FfxErrorCode (*FfxFsr2ExecuteRenderJobsFunc)( /// FfxFsr2DestroyResourceFunc /// FfxFsr2CreatePipelineFunc /// FfxFsr2DestroyPipelineFunc -/// FfxFsr2ScheduleRenderJobFunc -/// FfxFsr2ExecuteRenderJobsFunc +/// FfxFsr2ScheduleGpuJobFunc +/// FfxFsr2ExecuteGpuJobsFunc /// /// Depending on the graphics API that is abstracted by the backend, it may be /// required that the backend is to some extent stateful. To ensure that @@ -393,9 +354,9 @@ typedef FfxErrorCode (*FfxFsr2ExecuteRenderJobsFunc)( /// @ingroup FSR2 typedef struct FfxFsr2Interface { - FfxFsr2CreateDeviceFunc fpCreateDevice; ///< A callback function to create (or reference) a device. + FfxFsr2CreateBackendContextFunc fpCreateBackendContext; ///< A callback function to create and initialize the backend context. FfxFsr2GetDeviceCapabilitiesFunc fpGetDeviceCapabilities; ///< A callback function to query device capabilites. - FfxFsr2DestroyDeviceFunc fpDestroyDevice; ///< A callback function to destroy (or dereference) a device. + FfxFsr2DestroyBackendContextFunc fpDestroyBackendContext; ///< A callback function to destroy the backendcontext. This also dereferences the device. FfxFsr2CreateResourceFunc fpCreateResource; ///< A callback function to create a resource. FfxFsr2RegisterResourceFunc fpRegisterResource; ///< A callback function to register an external resource. FfxFsr2UnregisterResourcesFunc fpUnregisterResources; ///< A callback function to unregister external resource. @@ -403,8 +364,8 @@ typedef struct FfxFsr2Interface { FfxFsr2DestroyResourceFunc fpDestroyResource; ///< A callback function to destroy a resource. FfxFsr2CreatePipelineFunc fpCreatePipeline; ///< A callback function to create a render or compute pipeline. 
FfxFsr2DestroyPipelineFunc fpDestroyPipeline; ///< A callback function to destroy a render or compute pipeline. - FfxFsr2ScheduleRenderJobFunc fpScheduleRenderJob; ///< A callback function to schedule a render job. - FfxFsr2ExecuteRenderJobsFunc fpExecuteRenderJobs; ///< A callback function to execute all queued render jobs. + FfxFsr2ScheduleGpuJobFunc fpScheduleGpuJob; ///< A callback function to schedule a render job. + FfxFsr2ExecuteGpuJobsFunc fpExecuteGpuJobs; ///< A callback function to execute all queued render jobs. void* scratchBuffer; ///< A preallocated buffer for memory utilized internally by the backend. size_t scratchBufferSize; ///< Size of the buffer pointed to by scratchBuffer. diff --git a/src/ffx-fsr2-api/ffx_fsr2_maximum_bias.h b/src/ffx-fsr2-api/ffx_fsr2_maximum_bias.h index 2058fef..ad840f3 100644 --- a/src/ffx-fsr2-api/ffx_fsr2_maximum_bias.h +++ b/src/ffx-fsr2-api/ffx_fsr2_maximum_bias.h @@ -23,8 +23,8 @@ #pragma once -static const int32_t FFX_FSR2_MAXIMUM_BIAS_TEXTURE_WIDTH = 16; -static const int32_t FFX_FSR2_MAXIMUM_BIAS_TEXTURE_HEIGHT = 16; +static const int FFX_FSR2_MAXIMUM_BIAS_TEXTURE_WIDTH = 16; +static const int FFX_FSR2_MAXIMUM_BIAS_TEXTURE_HEIGHT = 16; static const float ffxFsr2MaximumBias[] = { 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.876f, 1.809f, 1.772f, 1.753f, 1.748f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 1.869f, 1.801f, 1.764f, 1.745f, 1.739f, diff --git a/src/ffx-fsr2-api/ffx_types.h b/src/ffx-fsr2-api/ffx_types.h index dcaa866..75fb0e8 100644 --- a/src/ffx-fsr2-api/ffx_types.h +++ b/src/ffx-fsr2-api/ffx_types.h @@ -43,6 +43,9 @@ /// Maximum size of bound constant buffers. #define FFX_MAX_CONST_SIZE 64 +/// Off by default warnings +#pragma warning(disable : 4365 4710 4820 5039) + #ifdef __cplusplus extern "C" { #endif // #ifdef __cplusplus @@ -66,6 +69,7 @@ typedef enum FfxSurfaceFormat { FFX_SURFACE_FORMAT_R16_UNORM, ///< 16 bit per channel, 1 channel unsigned normalized format FFX_SURFACE_FORMAT_R16_SNORM, ///< 16 bit per channel, 1 channel signed normalized format FFX_SURFACE_FORMAT_R8_UNORM, ///< 8 bit per channel, 1 channel unsigned normalized format + FFX_SURFACE_FORMAT_R8G8_UNORM, ///< 8 bit per channel, 2 channel unsigned normalized format FFX_SURFACE_FORMAT_R32_FLOAT ///< 32 bit per channel, 1 channel float format } FfxSurfaceFormat; @@ -146,12 +150,12 @@ typedef enum FfxHeapType { } FfxHeapType; /// An enumberation for different render job types -typedef enum FfxRenderJobType { +typedef enum FfxGpuJobType { - FFX_RENDER_JOB_CLEAR_FLOAT = 0, ///< The render job is performing a floating-point clear. - FFX_RENDER_JOB_COPY = 1, ///< The render job is performing a copy. - FFX_RENDER_JOB_COMPUTE = 2, ///< The render job is performing a compute dispatch. -} FfxRenderJobType; + FFX_GPU_JOB_CLEAR_FLOAT = 0, ///< The GPU job is performing a floating-point clear. + FFX_GPU_JOB_COPY = 1, ///< The GPU job is performing a copy. + FFX_GPU_JOB_COMPUTE = 2, ///< The GPU job is performing a compute dispatch. +} FfxGpuJobType; /// A typedef representing the graphics device. typedef void* FfxDevice; @@ -211,9 +215,7 @@ typedef struct FfxResourceDescription { /// An outward facing structure containing a resource typedef struct FfxResource { void* resource; ///< pointer to the resource. 
-#ifdef _DEBUG wchar_t name[64]; -#endif FfxResourceDescription description; FfxResourceStates state; bool isDepth; @@ -225,12 +227,13 @@ typedef struct FfxResourceInternal { int32_t internalIndex; ///< The index of the resource. } FfxResourceInternal; + /// A structure defining a resource bind point typedef struct FfxResourceBinding { uint32_t slotIndex; uint32_t resourceIdentifier; - char name[64]; + wchar_t name[64]; }FfxResourceBinding; /// A structure encapsulating a single pass of an algorithm. @@ -250,7 +253,6 @@ typedef struct FfxPipelineState { /// A structure containing the data required to create a resource. typedef struct FfxCreateResourceDescription { - FfxDevice device; ///< The FfxDevice. FfxHeapType heapType; ///< The heap type to hold the resource, typically FFX_HEAP_TYPE_DEFAULT. FfxResourceDescription resourceDescription; ///< A resource description. FfxResourceStates initalState; ///< The initial resource state. @@ -261,6 +263,42 @@ typedef struct FfxCreateResourceDescription { uint32_t id; ///< Internal resource ID. } FfxCreateResourceDescription; +/// A structure containing the description used to create a +/// FfxPipeline structure. +/// +/// A pipeline is the name given to a shader and the collection of state that +/// is required to dispatch it. In the context of FSR2 and its architecture +/// this means that a FfxPipelineDescription will map to either a +/// monolithic object in an explicit API (such as a +/// PipelineStateObject in DirectX 12). Or a shader and some +/// ancillary API objects (in something like DirectX 11). +/// +/// The contextFlags field contains a copy of the flags passed +/// to ffxFsr2ContextCreate via the flags field of +/// the FfxFsr2InitializationParams structure. These flags are +/// used to determine which permutation of a pipeline for a specific +/// FfxFsr2Pass should be used to implement the features required +/// by each application, as well as to acheive the best performance on specific +/// target hardware configurations. +/// +/// When using one of the provided backends for FSR2 (such as DirectX 12 or +/// Vulkan) the data required to create a pipeline is compiled offline and +/// included into the backend library that you are using. For cases where the +/// backend interface is overriden by providing custom callback function +/// implementations care should be taken to respect the contents of the +/// contextFlags field in order to correctly support the options +/// provided by FSR2, and acheive best performance. +/// +/// @ingroup FSR2 +typedef struct FfxPipelineDescription { + + uint32_t contextFlags; ///< A collection of FfxFsr2InitializationFlagBits which were passed to the context. + FfxFilterType* samplers; ///< Array of static samplers. + size_t samplerCount; ///< The number of samples contained inside samplers. + const uint32_t* rootConstantBufferSizes; ///< Array containing the sizes of the root constant buffers (count of 32 bit elements). + uint32_t rootConstantBufferCount; ///< The number of root constants contained within rootConstantBufferSizes. +} FfxPipelineDescription; + /// A structure containing a constant buffer. typedef struct FfxConstantBuffer { @@ -281,12 +319,12 @@ typedef struct FfxComputeJobDescription { FfxPipelineState pipeline; ///< Compute pipeline for the render job. uint32_t dimensions[3]; ///< Dispatch dimensions. FfxResourceInternal srvs[FFX_MAX_NUM_SRVS]; ///< SRV resources to be bound in the compute job. 
- char srvNames[FFX_MAX_NUM_SRVS][64]; + wchar_t srvNames[FFX_MAX_NUM_SRVS][64]; FfxResourceInternal uavs[FFX_MAX_NUM_UAVS]; ///< UAV resources to be bound in the compute job. uint32_t uavMip[FFX_MAX_NUM_UAVS]; ///< Mip level of UAV resources to be bound in the compute job. - char uavNames[FFX_MAX_NUM_UAVS][64]; + wchar_t uavNames[FFX_MAX_NUM_UAVS][64]; FfxConstantBuffer cbs[FFX_MAX_NUM_CONST_BUFFERS]; ///< Constant buffers to be bound in the compute job. - char cbNames[FFX_MAX_NUM_CONST_BUFFERS][64]; + wchar_t cbNames[FFX_MAX_NUM_CONST_BUFFERS][64]; } FfxComputeJobDescription; /// A structure describing a copy render job. @@ -297,16 +335,16 @@ typedef struct FfxCopyJobDescription } FfxCopyJobDescription; /// A structure describing a single render job. -typedef struct FfxRenderJobDescription { +typedef struct FfxGpuJobDescription{ - FfxRenderJobType jobType; ///< Type of the render job. + FfxGpuJobType jobType; ///< Type of the job. union { - FfxClearFloatJobDescription clearJobDescriptor; ///< Render job descriptor. Valid when jobType is FFX_RENDER_JOB_CLEAR_FLOAT. - FfxCopyJobDescription copyJobDescriptor; ///< Render job descriptor. Valid when jobType is FFX_RENDER_JOB_COPY. - FfxComputeJobDescription computeJobDescriptor; ///< Render job descriptor. Valid when jobType is FFX_RENDER_JOB_COMPUTE. + FfxClearFloatJobDescription clearJobDescriptor; ///< Clear job descriptor. Valid when jobType is FFX_RENDER_JOB_CLEAR_FLOAT. + FfxCopyJobDescription copyJobDescriptor; ///< Copy job descriptor. Valid when jobType is FFX_RENDER_JOB_COPY. + FfxComputeJobDescription computeJobDescriptor; ///< Compute job descriptor. Valid when jobType is FFX_RENDER_JOB_COMPUTE. }; -} FfxRenderJobDescription; +} FfxGpuJobDescription; #ifdef __cplusplus } diff --git a/src/ffx-fsr2-api/shaders/ffx_common_types.h b/src/ffx-fsr2-api/shaders/ffx_common_types.h index abcb979..cf6ba99 100644 --- a/src/ffx-fsr2-api/shaders/ffx_common_types.h +++ b/src/ffx-fsr2-api/shaders/ffx_common_types.h @@ -18,7 +18,6 @@ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
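Because the scheduling path is now expressed in terms of FfxGpuJobDescription and FfxGpuJobType, a custom backend pairs fpScheduleGpuJob (queue the job) with fpExecuteGpuJobs (record it into the command list). The sketch below shows the shape of such an executor; MyBackend, its job queue, and the record* helpers are hypothetical names, not part of this patch.

    // Hypothetical custom-backend executor draining jobs queued via fpScheduleGpuJob.
    static FfxErrorCode myExecuteGpuJobs(FfxFsr2Interface* backendInterface, FfxCommandList commandList)
    {
        MyBackend* backend = reinterpret_cast<MyBackend*>(backendInterface->scratchBuffer);

        for (const FfxGpuJobDescription& job : backend->queuedJobs) {
            switch (job.jobType) {
            case FFX_GPU_JOB_CLEAR_FLOAT:
                backend->recordClear(commandList, job.clearJobDescriptor);      // float clear of the job's target
                break;
            case FFX_GPU_JOB_COPY:
                backend->recordCopy(commandList, job.copyJobDescriptor);        // resource-to-resource copy
                break;
            case FFX_GPU_JOB_COMPUTE:
                backend->recordDispatch(commandList, job.computeJobDescriptor); // binds the wchar_t-named SRVs/UAVs/CBs
                break;
            }
        }

        backend->queuedJobs.clear();
        return FFX_OK;
    }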
- #ifndef FFX_COMMON_TYPES_H #define FFX_COMMON_TYPES_H @@ -246,18 +245,22 @@ typedef min16int4 FfxInt16x4; #if FFX_HALF -#define FFX_MIN16_SCALAR( TypeName, BaseComponentType ) typedef min16##BaseComponentType TypeName; -#define FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; -#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; - #if FFX_HLSL_6_2 +#define FFX_MIN16_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType##16_t TypeName; +#define FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; +#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + #define FFX_16BIT_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType##16_t TypeName; #define FFX_16BIT_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; #define FFX_16BIT_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; #else //FFX_HLSL_6_2 +#define FFX_MIN16_SCALAR( TypeName, BaseComponentType ) typedef min16##BaseComponentType TypeName; +#define FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; +#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + #define FFX_16BIT_SCALAR( TypeName, BaseComponentType ) FFX_MIN16_SCALAR( TypeName, BaseComponentType ); #define FFX_16BIT_VECTOR( TypeName, BaseComponentType, COL ) FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ); #define FFX_16BIT_MATRIX( TypeName, BaseComponentType, ROW, COL ) FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ); diff --git a/src/ffx-fsr2-api/shaders/ffx_core_cpu.h b/src/ffx-fsr2-api/shaders/ffx_core_cpu.h index b753459..9bb9915 100644 --- a/src/ffx-fsr2-api/shaders/ffx_core_cpu.h +++ b/src/ffx-fsr2-api/shaders/ffx_core_cpu.h @@ -36,6 +36,10 @@ #define FFX_STATIC static #endif // #if !defined(FFX_STATIC) +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wunused-variable" +#endif + /// Interpret the bit layout of an IEEE-754 floating point value as an unsigned integer. /// /// @param [in] x A 32bit floating value. diff --git a/src/ffx-fsr2-api/shaders/ffx_core_hlsl.h b/src/ffx-fsr2-api/shaders/ffx_core_hlsl.h index e1db5e3..f114687 100644 --- a/src/ffx-fsr2-api/shaders/ffx_core_hlsl.h +++ b/src/ffx-fsr2-api/shaders/ffx_core_hlsl.h @@ -1066,405 +1066,274 @@ FfxUInt32 AShrSU1(FfxUInt32 a, FfxUInt32 b) //============================================================================================================================== // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). 
// Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ -FfxFloat16x2 ffxUint32ToFloat16x2(FfxUInt32 x) +FFX_MIN16_F2 ffxUint32ToFloat16x2(FfxUInt32 x) { - FfxFloat32x2 t = f16tof32(FfxUInt32x2(x & 0xFFFF, x >> 16)); - return FfxFloat16x2(t); + FfxFloat32x2 t = f16tof32(FfxUInt32x2(x & 0xFFFF, x >> 16)); + return FFX_MIN16_F2(t); } -FfxFloat16x4 ffxUint32x2ToFloat16x4(FfxUInt32x2 x) +FFX_MIN16_F4 ffxUint32x2ToFloat16x4(FfxUInt32x2 x) { - return FfxFloat16x4(ffxUint32ToFloat16x2(x.x), ffxUint32ToFloat16x2(x.y)); + return FFX_MIN16_F4(ffxUint32ToFloat16x2(x.x), ffxUint32ToFloat16x2(x.y)); } -FfxUInt16x2 ffxUint32ToUint16x2(FfxUInt32 x) +FFX_MIN16_U2 ffxUint32ToUint16x2(FfxUInt32 x) { - FfxUInt32x2 t = FfxUInt32x2(x & 0xFFFF, x >> 16); - return FfxUInt16x2(t); + FfxUInt32x2 t = FfxUInt32x2(x & 0xFFFF, x >> 16); + return FFX_MIN16_U2(t); } -FfxUInt16x4 ffxUint32x2ToUint16x4(FfxUInt32x2 x) +FFX_MIN16_U4 ffxUint32x2ToUint16x4(FfxUInt32x2 x) { - return FfxUInt16x4(ffxUint32ToUint16x2(x.x), ffxUint32ToUint16x2(x.y)); + return FFX_MIN16_U4(ffxUint32ToUint16x2(x.x), ffxUint32ToUint16x2(x.y)); } #define FFX_UINT32_TO_FLOAT16X2(x) ffxUint32ToFloat16x2(FfxUInt32(x)) #define FFX_UINT32X2_TO_FLOAT16X4(x) ffxUint32x2ToFloat16x4(FfxUInt32x2(x)) #define FFX_UINT32_TO_UINT16X2(x) ffxUint32ToUint16x2(FfxUInt32(x)) #define FFX_UINT32X2_TO_UINT16X4(x) ffxUint32x2ToUint16x4(FfxUInt32x2(x)) //------------------------------------------------------------------------------------------------------------------------------ -FfxUInt32 ffxFloat16x2ToUint32(FfxFloat16x2 x) +FfxUInt32 FFX_MIN16_F2ToUint32(FFX_MIN16_F2 x) { - return f32tof16(x.x) + (f32tof16(x.y) << 16); + return f32tof16(x.x) + (f32tof16(x.y) << 16); } -FfxUInt32x2 ffxFloat16x4ToUint32x2(FfxFloat16x4 x) +FfxUInt32x2 FFX_MIN16_F4ToUint32x2(FFX_MIN16_F4 x) { - return FfxUInt32x2(ffxFloat16x2ToUint32(x.xy), ffxFloat16x2ToUint32(x.zw)); + return FfxUInt32x2(FFX_MIN16_F2ToUint32(x.xy), FFX_MIN16_F2ToUint32(x.zw)); } -FfxUInt32 ffxUint16x2ToUint32(FfxUInt16x2 x) +FfxUInt32 FFX_MIN16_U2ToUint32(FFX_MIN16_U2 x) { - return FfxUInt32(x.x) + (FfxUInt32(x.y) << 16); + return FfxUInt32(x.x) + (FfxUInt32(x.y) << 16); } -FfxUInt32x2 ffxUint16x4ToUint32x2(FfxUInt16x4 x) +FfxUInt32x2 FFX_MIN16_U4ToUint32x2(FFX_MIN16_U4 x) { - return FfxUInt32x2(ffxUint16x2ToUint32(x.xy), ffxUint16x2ToUint32(x.zw)); + return FfxUInt32x2(FFX_MIN16_U2ToUint32(x.xy), FFX_MIN16_U2ToUint32(x.zw)); } -#define FFX_FLOAT16X2_TO_UINT32(x) ffxFloat16x2ToUint32(FfxFloat16x2(x)) -#define FFX_FLOAT16X4_TO_UINT32X2(x) ffxFloat16x4ToUint32x2(FfxFloat16x4(x)) -#define FFX_UINT16X2_TO_UINT32(x) ffxUint16x2ToUint32(FfxUInt16x2(x)) -#define FFX_UINT16X4_TO_UINT32X2(x) ffxUint16x4ToUint32x2(FfxUInt16x4(x)) +#define FFX_FLOAT16X2_TO_UINT32(x) FFX_MIN16_F2ToUint32(FFX_MIN16_F2(x)) +#define FFX_FLOAT16X4_TO_UINT32X2(x) FFX_MIN16_F4ToUint32x2(FFX_MIN16_F4(x)) +#define FFX_UINT16X2_TO_UINT32(x) FFX_MIN16_U2ToUint32(FFX_MIN16_U2(x)) +#define FFX_UINT16X4_TO_UINT32X2(x) FFX_MIN16_U4ToUint32x2(FFX_MIN16_U4(x)) #if defined(FFX_HLSL_6_2) && !defined(FFX_NO_16_BIT_CAST) - #define FFX_TO_UINT16(x) asuint16(x) - #define FFX_TO_UINT16X2(x) asuint16(x) - #define FFX_TO_UINT16X3(x) asuint16(x) - #define FFX_TO_UINT16X4(x) asuint16(x) +#define FFX_TO_UINT16(x) asuint16(x) +#define FFX_TO_UINT16X2(x) asuint16(x) +#define FFX_TO_UINT16X3(x) asuint16(x) +#define FFX_TO_UINT16X4(x) asuint16(x) #else - #define FFX_TO_UINT16(a) FfxUInt16(f32tof16(FfxFloat32(a))) - #define FFX_TO_UINT16X2(a) 
FfxUInt16x2(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y)) - #define FFX_TO_UINT16X3(a) FfxUInt16x3(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y), FFX_TO_UINT16((a).z)) - #define FFX_TO_UINT16X4(a) FfxUInt16x4(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y), FFX_TO_UINT16((a).z), FFX_TO_UINT16((a).w)) +#define FFX_TO_UINT16(a) FFX_MIN16_U(f32tof16(FfxFloat32(a))) +#define FFX_TO_UINT16X2(a) FFX_MIN16_U2(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y)) +#define FFX_TO_UINT16X3(a) FFX_MIN16_U3(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y), FFX_TO_UINT16((a).z)) +#define FFX_TO_UINT16X4(a) FFX_MIN16_U4(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y), FFX_TO_UINT16((a).z), FFX_TO_UINT16((a).w)) #endif // #if defined(FFX_HLSL_6_2) && !defined(FFX_NO_16_BIT_CAST) #if defined(FFX_HLSL_6_2) && !defined(FFX_NO_16_BIT_CAST) - #define FFX_TO_FLOAT16(x) asfloat16(x) - #define FFX_TO_FLOAT16X2(x) asfloat16(x) - #define FFX_TO_FLOAT16X3(x) asfloat16(x) - #define FFX_TO_FLOAT16X4(x) asfloat16(x) +#define FFX_TO_FLOAT16(x) asfloat16(x) +#define FFX_TO_FLOAT16X2(x) asfloat16(x) +#define FFX_TO_FLOAT16X3(x) asfloat16(x) +#define FFX_TO_FLOAT16X4(x) asfloat16(x) #else - #define FFX_TO_FLOAT16(a) FfxFloat16(f16tof32(FfxUInt32(a))) - #define FFX_TO_FLOAT16X2(a) FfxFloat16x2(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y)) - #define FFX_TO_FLOAT16X3(a) FfxFloat16x3(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y), FFX_TO_FLOAT16((a).z)) - #define FFX_TO_FLOAT16X4(a) FfxFloat16x4(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y), FFX_TO_FLOAT16((a).z), FFX_TO_FLOAT16((a).w)) +#define FFX_TO_FLOAT16(a) FFX_MIN16_F(f16tof32(FfxUInt32(a))) +#define FFX_TO_FLOAT16X2(a) FFX_MIN16_F2(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y)) +#define FFX_TO_FLOAT16X3(a) FFX_MIN16_F3(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y), FFX_TO_FLOAT16((a).z)) +#define FFX_TO_FLOAT16X4(a) FFX_MIN16_F4(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y), FFX_TO_FLOAT16((a).z), FFX_TO_FLOAT16((a).w)) #endif // #if defined(FFX_HLSL_6_2) && !defined(FFX_NO_16_BIT_CAST) //============================================================================================================================== -#if FFX_HLSL_6_2 -FfxFloat16 ffxBroadcastFloat16(FfxFloat16 a) -{ - return FfxFloat16(a); -} -FfxFloat16x2 ffxBroadcastFloat16x2(FfxFloat16 a) -{ - return FfxFloat16x2(a, a); -} -FfxFloat16x3 ffxBroadcastFloat16x3(FfxFloat16 a) -{ - return FfxFloat16x3(a, a, a); -} -FfxFloat16x4 ffxBroadcastFloat16x4(FfxFloat16 a) -{ - return FfxFloat16x4(a, a, a, a); -} -#define FFX_BROADCAST_FLOAT16(a) FfxFloat16(a) -#define FFX_BROADCAST_FLOAT16X2(a) FfxFloat16(a) -#define FFX_BROADCAST_FLOAT16X3(a) FfxFloat16(a) -#define FFX_BROADCAST_FLOAT16X4(a) FfxFloat16(a) -#else #define FFX_BROADCAST_FLOAT16(a) FFX_MIN16_F(a) #define FFX_BROADCAST_FLOAT16X2(a) FFX_MIN16_F(a) #define FFX_BROADCAST_FLOAT16X3(a) FFX_MIN16_F(a) #define FFX_BROADCAST_FLOAT16X4(a) FFX_MIN16_F(a) -#endif + //------------------------------------------------------------------------------------------------------------------------------ -#if FFX_HLSL_6_2 -FfxInt16 ffxBroadcastInt16(FfxInt16 a) -{ - return FfxInt16(a); -} -FfxInt16x2 ffxBroadcastInt16x2(FfxInt16 a) -{ - return FfxInt16x2(a, a); -} -FfxInt16x3 ffxBroadcastInt16x3(FfxInt16 a) -{ - return FfxInt16x3(a, a, a); -} -FfxInt16x4 ffxBroadcastInt16x4(FfxInt16 a) -{ - return FfxInt16x4(a, a, a, a); -} -#define FFX_BROADCAST_INT16(a) FfxInt16(a) -#define FFX_BROADCAST_INT16X2(a) FfxInt16(a) -#define FFX_BROADCAST_INT16X3(a) FfxInt16(a) -#define FFX_BROADCAST_INT16X4(a) FfxInt16(a) -#else 
#define FFX_BROADCAST_INT16(a) FFX_MIN16_I(a) #define FFX_BROADCAST_INT16X2(a) FFX_MIN16_I(a) #define FFX_BROADCAST_INT16X3(a) FFX_MIN16_I(a) #define FFX_BROADCAST_INT16X4(a) FFX_MIN16_I(a) -#endif + //------------------------------------------------------------------------------------------------------------------------------ -#if FFX_HLSL_6_2 -FfxUInt16 ffxBroadcastUInt16(FfxUInt16 a) -{ - return FfxUInt16(a); -} -FfxUInt16x2 ffxBroadcastUInt16x2(FfxUInt16 a) -{ - return FfxUInt16x2(a, a); -} -FfxUInt16x3 ffxBroadcastUInt16x3(FfxUInt16 a) -{ - return FfxUInt16x3(a, a, a); -} -FfxUInt16x4 ffxBroadcastUInt16x4(FfxUInt16 a) -{ - return FfxUInt16x4(a, a, a, a); -} -#define FFX_BROADCAST_UINT16(a) FfxUInt16(a) -#define FFX_BROADCAST_UINT16X2(a) FfxUInt16(a) -#define FFX_BROADCAST_UINT16X3(a) FfxUInt16(a) -#define FFX_BROADCAST_UINT16X4(a) FfxUInt16(a) -#else #define FFX_BROADCAST_UINT16(a) FFX_MIN16_U(a) #define FFX_BROADCAST_UINT16X2(a) FFX_MIN16_U(a) #define FFX_BROADCAST_UINT16X3(a) FFX_MIN16_U(a) #define FFX_BROADCAST_UINT16X4(a) FFX_MIN16_U(a) -#endif + //============================================================================================================================== -FfxUInt16 ffxAbsHalf(FfxUInt16 a) +FFX_MIN16_U ffxAbsHalf(FFX_MIN16_U a) { - return FfxUInt16(abs(FfxInt16(a))); + return FFX_MIN16_U(abs(FFX_MIN16_I(a))); } -FfxUInt16x2 ffxAbsHalf(FfxUInt16x2 a) +FFX_MIN16_U2 ffxAbsHalf(FFX_MIN16_U2 a) { - return FfxUInt16x2(abs(FfxInt16x2(a))); + return FFX_MIN16_U2(abs(FFX_MIN16_I2(a))); } -FfxUInt16x3 ffxAbsHalf(FfxUInt16x3 a) +FFX_MIN16_U3 ffxAbsHalf(FFX_MIN16_U3 a) { - return FfxUInt16x3(abs(FfxInt16x3(a))); + return FFX_MIN16_U3(abs(FFX_MIN16_I3(a))); } -FfxUInt16x4 ffxAbsHalf(FfxUInt16x4 a) +FFX_MIN16_U4 ffxAbsHalf(FFX_MIN16_U4 a) { - return FfxUInt16x4(abs(FfxInt16x4(a))); + return FFX_MIN16_U4(abs(FFX_MIN16_I4(a))); } //------------------------------------------------------------------------------------------------------------------------------ -FfxFloat16 ffxClampHalf(FfxFloat16 x, FfxFloat16 n, FfxFloat16 m) +FFX_MIN16_F ffxClampHalf(FFX_MIN16_F x, FFX_MIN16_F n, FFX_MIN16_F m) { - return max(n, min(x, m)); + return max(n, min(x, m)); } -FfxFloat16x2 ffxClampHalf(FfxFloat16x2 x, FfxFloat16x2 n, FfxFloat16x2 m) +FFX_MIN16_F2 ffxClampHalf(FFX_MIN16_F2 x, FFX_MIN16_F2 n, FFX_MIN16_F2 m) { - return max(n, min(x, m)); + return max(n, min(x, m)); } -FfxFloat16x3 ffxClampHalf(FfxFloat16x3 x, FfxFloat16x3 n, FfxFloat16x3 m) +FFX_MIN16_F3 ffxClampHalf(FFX_MIN16_F3 x, FFX_MIN16_F3 n, FFX_MIN16_F3 m) { - return max(n, min(x, m)); + return max(n, min(x, m)); } -FfxFloat16x4 ffxClampHalf(FfxFloat16x4 x, FfxFloat16x4 n, FfxFloat16x4 m) +FFX_MIN16_F4 ffxClampHalf(FFX_MIN16_F4 x, FFX_MIN16_F4 n, FFX_MIN16_F4 m) { - return max(n, min(x, m)); + return max(n, min(x, m)); } //------------------------------------------------------------------------------------------------------------------------------ // V_FRACT_F16 (note DX frac() is different). 
-FfxFloat16 ffxFract(FfxFloat16 x) +FFX_MIN16_F ffxFract(FFX_MIN16_F x) { - return x - floor(x); + return x - floor(x); } -FfxFloat16x2 ffxFract(FfxFloat16x2 x) +FFX_MIN16_F2 ffxFract(FFX_MIN16_F2 x) { - return x - floor(x); + return x - floor(x); } -FfxFloat16x3 ffxFract(FfxFloat16x3 x) +FFX_MIN16_F3 ffxFract(FFX_MIN16_F3 x) { - return x - floor(x); + return x - floor(x); } -FfxFloat16x4 ffxFract(FfxFloat16x4 x) +FFX_MIN16_F4 ffxFract(FFX_MIN16_F4 x) { - return x - floor(x); + return x - floor(x); } //------------------------------------------------------------------------------------------------------------------------------ -FfxFloat16 ffxLerp(FfxFloat16 x, FfxFloat16 y, FfxFloat16 a) +FFX_MIN16_F ffxLerp(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F a) { - return lerp(x, y, a); + return lerp(x, y, a); } -FfxFloat16x2 ffxLerp(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16 a) +FFX_MIN16_F2 ffxLerp(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F a) { - return lerp(x, y, a); + return lerp(x, y, a); } -FfxFloat16x2 ffxLerp(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16x2 a) +FFX_MIN16_F2 ffxLerp(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 a) { - return lerp(x, y, a); + return lerp(x, y, a); } -FfxFloat16x3 ffxLerp(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16 a) +FFX_MIN16_F3 ffxLerp(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F a) { - return lerp(x, y, a); + return lerp(x, y, a); } -FfxFloat16x3 ffxLerp(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16x3 a) +FFX_MIN16_F3 ffxLerp(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 a) { - return lerp(x, y, a); + return lerp(x, y, a); } -FfxFloat16x4 ffxLerp(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16 a) +FFX_MIN16_F4 ffxLerp(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F a) { - return lerp(x, y, a); + return lerp(x, y, a); } -FfxFloat16x4 ffxLerp(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16x4 a) +FFX_MIN16_F4 ffxLerp(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 a) { - return lerp(x, y, a); + return lerp(x, y, a); } //------------------------------------------------------------------------------------------------------------------------------ -#if FFX_HLSL_6_2 -FFX_MIN16_F ffxLerp(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F t) +FFX_MIN16_F ffxMax3Half(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F z) { - return lerp(x, y, t); + return max(x, max(y, z)); } -FFX_MIN16_F2 ffxLerp(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F t) +FFX_MIN16_F2 ffxMax3Half(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 z) { - return lerp(x, y, t); + return max(x, max(y, z)); } -FFX_MIN16_F2 ffxLerp(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 t) +FFX_MIN16_F3 ffxMax3Half(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 z) { - return lerp(x, y, t); + return max(x, max(y, z)); } -FFX_MIN16_F3 ffxLerp(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F t) +FFX_MIN16_F4 ffxMax3Half(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 z) { - return lerp(x, y, t); -} -FFX_MIN16_F3 ffxLerp(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 t) -{ - return lerp(x, y, t); -} -FFX_MIN16_F4 ffxLerp(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F t) -{ - return lerp(x, y, t); -} -FFX_MIN16_F4 ffxLerp(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 t) -{ - return lerp(x, y, t); + return max(x, max(y, z)); } //------------------------------------------------------------------------------------------------------------------------------ -FFX_MIN16_F ffxMin(FFX_MIN16_F x, FFX_MIN16_F y) +FFX_MIN16_F ffxMin3Half(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F z) { - return min(x, y); + return min(x, min(y, z)); } -FFX_MIN16_F2 ffxMin(FFX_MIN16_F2 x, FFX_MIN16_F2 y) 
+FFX_MIN16_F2 ffxMin3Half(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 z) { - return min(x, y); + return min(x, min(y, z)); } -FFX_MIN16_F3 ffxMin(FFX_MIN16_F3 x, FFX_MIN16_F3 y) +FFX_MIN16_F3 ffxMin3Half(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 z) { - return min(x, y); + return min(x, min(y, z)); } -FFX_MIN16_F4 ffxMin(FFX_MIN16_F4 x, FFX_MIN16_F4 y) +FFX_MIN16_F4 ffxMin3Half(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 z) { - return min(x, y); + return min(x, min(y, z)); } //------------------------------------------------------------------------------------------------------------------------------ -FFX_MIN16_F ffxMax(FFX_MIN16_F x, FFX_MIN16_F y) +FFX_MIN16_F ffxReciprocalHalf(FFX_MIN16_F x) { - return max(x, y); + return rcp(x); } -FFX_MIN16_F2 ffxMax(FFX_MIN16_F2 x, FFX_MIN16_F2 y) +FFX_MIN16_F2 ffxReciprocalHalf(FFX_MIN16_F2 x) { - return max(x, y); + return rcp(x); } -FFX_MIN16_F3 ffxMax(FFX_MIN16_F3 x, FFX_MIN16_F3 y) +FFX_MIN16_F3 ffxReciprocalHalf(FFX_MIN16_F3 x) { - return max(x, y); + return rcp(x); } -FFX_MIN16_F4 ffxMax(FFX_MIN16_F4 x, FFX_MIN16_F4 y) +FFX_MIN16_F4 ffxReciprocalHalf(FFX_MIN16_F4 x) { - return max(x, y); -} -#endif -//------------------------------------------------------------------------------------------------------------------------------ -FfxFloat16 ffxMax3Half(FfxFloat16 x, FfxFloat16 y, FfxFloat16 z) -{ - return max(x, max(y, z)); -} -FfxFloat16x2 ffxMax3Half(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16x2 z) -{ - return max(x, max(y, z)); -} -FfxFloat16x3 ffxMax3Half(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16x3 z) -{ - return max(x, max(y, z)); -} -FfxFloat16x4 ffxMax3Half(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16x4 z) -{ - return max(x, max(y, z)); + return rcp(x); } //------------------------------------------------------------------------------------------------------------------------------ -FfxFloat16 ffxMin3Half(FfxFloat16 x, FfxFloat16 y, FfxFloat16 z) +FFX_MIN16_F ffxReciprocalSquareRootHalf(FFX_MIN16_F x) { - return min(x, min(y, z)); + return rsqrt(x); } -FfxFloat16x2 ffxMin3Half(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16x2 z) +FFX_MIN16_F2 ffxReciprocalSquareRootHalf(FFX_MIN16_F2 x) { - return min(x, min(y, z)); + return rsqrt(x); } -FfxFloat16x3 ffxMin3Half(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16x3 z) +FFX_MIN16_F3 ffxReciprocalSquareRootHalf(FFX_MIN16_F3 x) { - return min(x, min(y, z)); + return rsqrt(x); } -FfxFloat16x4 ffxMin3Half(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16x4 z) +FFX_MIN16_F4 ffxReciprocalSquareRootHalf(FFX_MIN16_F4 x) { - return min(x, min(y, z)); + return rsqrt(x); } //------------------------------------------------------------------------------------------------------------------------------ -FfxFloat16 ffxReciprocalHalf(FfxFloat16 x) +FFX_MIN16_F ffxSaturate(FFX_MIN16_F x) { - return rcp(x); + return saturate(x); } -FfxFloat16x2 ffxReciprocalHalf(FfxFloat16x2 x) +FFX_MIN16_F2 ffxSaturate(FFX_MIN16_F2 x) { - return rcp(x); + return saturate(x); } -FfxFloat16x3 ffxReciprocalHalf(FfxFloat16x3 x) +FFX_MIN16_F3 ffxSaturate(FFX_MIN16_F3 x) { - return rcp(x); + return saturate(x); } -FfxFloat16x4 ffxReciprocalHalf(FfxFloat16x4 x) +FFX_MIN16_F4 ffxSaturate(FFX_MIN16_F4 x) { - return rcp(x); + return saturate(x); } //------------------------------------------------------------------------------------------------------------------------------ -FfxFloat16 ffxReciprocalSquareRootHalf(FfxFloat16 x) +FFX_MIN16_U ffxBitShiftRightHalf(FFX_MIN16_U a, FFX_MIN16_U b) { - return rsqrt(x); + return FFX_MIN16_U(FFX_MIN16_I(a) >> 
FFX_MIN16_I(b)); } -FfxFloat16x2 ffxReciprocalSquareRootHalf(FfxFloat16x2 x) +FFX_MIN16_U2 ffxBitShiftRightHalf(FFX_MIN16_U2 a, FFX_MIN16_U2 b) { - return rsqrt(x); + return FFX_MIN16_U2(FFX_MIN16_I2(a) >> FFX_MIN16_I2(b)); } -FfxFloat16x3 ffxReciprocalSquareRootHalf(FfxFloat16x3 x) +FFX_MIN16_U3 ffxBitShiftRightHalf(FFX_MIN16_U3 a, FFX_MIN16_U3 b) { - return rsqrt(x); + return FFX_MIN16_U3(FFX_MIN16_I3(a) >> FFX_MIN16_I3(b)); } -FfxFloat16x4 ffxReciprocalSquareRootHalf(FfxFloat16x4 x) +FFX_MIN16_U4 ffxBitShiftRightHalf(FFX_MIN16_U4 a, FFX_MIN16_U4 b) { - return rsqrt(x); -} -//------------------------------------------------------------------------------------------------------------------------------ -FfxFloat16 ffxSaturate(FfxFloat16 x) -{ - return saturate(x); -} -FfxFloat16x2 ffxSaturate(FfxFloat16x2 x) -{ - return saturate(x); -} -FfxFloat16x3 ffxSaturate(FfxFloat16x3 x) -{ - return saturate(x); -} -FfxFloat16x4 ffxSaturate(FfxFloat16x4 x) -{ - return saturate(x); -} -//------------------------------------------------------------------------------------------------------------------------------ -FfxUInt16 ffxBitShiftRightHalf(FfxUInt16 a, FfxUInt16 b) -{ - return FfxUInt16(FfxInt16(a) >> FfxInt16(b)); -} -FfxUInt16x2 ffxBitShiftRightHalf(FfxUInt16x2 a, FfxUInt16x2 b) -{ - return FfxUInt16x2(FfxInt16x2(a) >> FfxInt16x2(b)); -} -FfxUInt16x3 ffxBitShiftRightHalf(FfxUInt16x3 a, FfxUInt16x3 b) -{ - return FfxUInt16x3(FfxInt16x3(a) >> FfxInt16x3(b)); -} -FfxUInt16x4 ffxBitShiftRightHalf(FfxUInt16x4 a, FfxUInt16x4 b) -{ - return FfxUInt16x4(FfxInt16x4(a) >> FfxInt16x4(b)); + return FFX_MIN16_U4(FFX_MIN16_I4(a) >> FFX_MIN16_I4(b)); } #endif // FFX_HALF diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr1.h b/src/ffx-fsr2-api/shaders/ffx_fsr1.h index 0636247..1ac23cf 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr1.h +++ b/src/ffx-fsr2-api/shaders/ffx_fsr1.h @@ -19,6 +19,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wunused-variable" +#endif + /// Setup required constant values for EASU (works on CPU or GPU). /// /// @param [out] con0 diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate.h index 83fd286..14620d5 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate.h +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate.h @@ -19,38 +19,24 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
-FFX_MIN16_F4 WrapDepthClipMask(FFX_MIN16_I2 iPxSample) -{ - return FFX_MIN16_F4(LoadDepthClip(iPxSample).r, 0, 0, 0); -} +#ifndef FFX_FSR2_ACCUMULATE_H +#define FFX_FSR2_ACCUMULATE_H -DeclareCustomFetchBilinearSamples(FetchDepthClipMaskSamples, WrapDepthClipMask) -DeclareCustomTextureSample(DepthClipMaskSample, Bilinear, FetchDepthClipMaskSamples) - -FFX_MIN16_F4 WrapTransparencyAndCompositionMask(FFX_MIN16_I2 iPxSample) -{ - return FFX_MIN16_F4(LoadTransparencyAndCompositionMask(iPxSample).r, 0, 0, 0); -} - -DeclareCustomFetchBilinearSamples(FetchTransparencyAndCompositionMaskSamples, WrapTransparencyAndCompositionMask) -DeclareCustomTextureSample(TransparencyAndCompositionMaskSample, Bilinear, FetchTransparencyAndCompositionMaskSamples) - -FfxFloat32x4 WrapLumaStabilityFactor(FFX_MIN16_I2 iPxSample) -{ - return FfxFloat32x4(LoadLumaStabilityFactor(iPxSample), 0, 0, 0); -} - -DeclareCustomFetchBilinearSamples(FetchLumaStabilitySamples, WrapLumaStabilityFactor) -DeclareCustomTextureSample(LumaStabilityFactorSample, Bilinear, FetchLumaStabilitySamples) +#define FFX_FSR2_OPTION_GUARANTEE_UPSAMPLE_WEIGHT_ON_NEW_SAMPLES 1 FfxFloat32 GetPxHrVelocity(FfxFloat32x2 fMotionVector) { return length(fMotionVector * DisplaySize()); } - -void Accumulate(FFX_MIN16_I2 iPxHrPos, FFX_PARAMETER_INOUT FfxFloat32x4 fHistory, FFX_PARAMETER_IN FfxFloat32x4 fUpsampled, FFX_PARAMETER_IN FfxFloat32 fDepthClipFactor, FFX_PARAMETER_IN FfxFloat32 fHrVelocity) +#if FFX_HALF +FFX_MIN16_F GetPxHrVelocity(FFX_MIN16_F2 fMotionVector) { + return length(fMotionVector * FFX_MIN16_F2(DisplaySize())); +} +#endif +void Accumulate(FfxInt32x2 iPxHrPos, FFX_PARAMETER_INOUT FfxFloat32x4 fHistory, FFX_PARAMETER_IN FfxFloat32x4 fUpsampled, FFX_PARAMETER_IN FfxFloat32 fDepthClipFactor, FFX_PARAMETER_IN FfxFloat32 fHrVelocity) +{ fHistory.w = fHistory.w + fUpsampled.w; fUpsampled.rgb = YCoCgToRGB(fUpsampled.rgb); @@ -58,56 +44,56 @@ void Accumulate(FFX_MIN16_I2 iPxHrPos, FFX_PARAMETER_INOUT FfxFloat32x4 fHistory const FfxFloat32 fAlpha = fUpsampled.w / fHistory.w; fHistory.rgb = ffxLerp(fHistory.rgb, fUpsampled.rgb, fAlpha); - FfxFloat32 fMaxAverageWeight = ffxLerp(MaxAccumulationWeight(), accumulationMaxOnMotion, ffxSaturate(fHrVelocity * 10.0f)); + FfxFloat32 fMaxAverageWeight = FfxFloat32(ffxLerp(MaxAccumulationWeight(), accumulationMaxOnMotion, ffxSaturate(fHrVelocity * 10.0f))); fHistory.w = ffxMin(fHistory.w, fMaxAverageWeight); } void RectifyHistory( RectificationBoxData clippingBox, inout FfxFloat32x4 fHistory, - FFX_PARAMETER_IN LOCK_STATUS_T fLockStatus, - FFX_PARAMETER_IN UPSAMPLE_F fDepthClipFactor, - FFX_PARAMETER_IN UPSAMPLE_F fLumaStabilityFactor, - FFX_PARAMETER_IN UPSAMPLE_F fLuminanceDiff, - FFX_PARAMETER_IN UPSAMPLE_F fUpsampleWeight, + FFX_PARAMETER_IN FfxFloat32x3 fLockStatus, + FFX_PARAMETER_IN FfxFloat32 fDepthClipFactor, + FFX_PARAMETER_IN FfxFloat32 fLumaStabilityFactor, + FFX_PARAMETER_IN FfxFloat32 fLuminanceDiff, + FFX_PARAMETER_IN FfxFloat32 fUpsampleWeight, FFX_PARAMETER_IN FfxFloat32 fLockContributionThisFrame) { - UPSAMPLE_F fScaleFactorInfluence = UPSAMPLE_F(1.0f / DownscaleFactor().x - 1); - UPSAMPLE_F fBoxScale = UPSAMPLE_F(1.0f) + (UPSAMPLE_F(0.5f) * fScaleFactorInfluence); + FfxFloat32 fScaleFactorInfluence = FfxFloat32(1.0f / DownscaleFactor().x - 1); + FfxFloat32 fBoxScale = FfxFloat32(1.0f) + (FfxFloat32(0.5f) * fScaleFactorInfluence); - FFX_MIN16_F3 fScaledBoxVec = clippingBox.boxVec * fBoxScale; - UPSAMPLE_F3 boxMin = clippingBox.boxCenter - fScaledBoxVec; - UPSAMPLE_F3 boxMax = 
clippingBox.boxCenter + fScaledBoxVec; - UPSAMPLE_F3 boxCenter = clippingBox.boxCenter; - UPSAMPLE_F boxVecSize = length(clippingBox.boxVec); + FfxFloat32x3 fScaledBoxVec = clippingBox.boxVec * fBoxScale; + FfxFloat32x3 boxMin = clippingBox.boxCenter - fScaledBoxVec; + FfxFloat32x3 boxMax = clippingBox.boxCenter + fScaledBoxVec; + FfxFloat32x3 boxCenter = clippingBox.boxCenter; + FfxFloat32 boxVecSize = length(clippingBox.boxVec); boxMin = ffxMax(clippingBox.aabbMin, boxMin); boxMax = ffxMin(clippingBox.aabbMax, boxMax); - UPSAMPLE_F3 distToClampOutside = UPSAMPLE_F3(ffxMax(ffxMax(UPSAMPLE_F3_BROADCAST(0.0f), boxMin - UPSAMPLE_F3(fHistory.xyz)), ffxMax(UPSAMPLE_F3_BROADCAST(0.0f), UPSAMPLE_F3(fHistory.xyz) - boxMax))); + FfxFloat32x3 distToClampOutside = ffxMax(ffxMax(FfxFloat32x3(0, 0, 0), boxMin - fHistory.xyz), ffxMax(FfxFloat32x3(0, 0, 0), fHistory.xyz - boxMax)); - if (any(FFX_GREATER_THAN(distToClampOutside, UPSAMPLE_F3_BROADCAST(0.0f)))) { + if (any(FFX_GREATER_THAN(distToClampOutside, FfxFloat32x3(0, 0, 0)))) { - const UPSAMPLE_F3 clampedHistorySample = clamp(UPSAMPLE_F3(fHistory.xyz), boxMin, boxMax); + const FfxFloat32x3 clampedHistorySample = clamp(fHistory.xyz, boxMin, boxMax); - UPSAMPLE_F3 clippedHistoryToBoxCenter = abs(clampedHistorySample - boxCenter); - UPSAMPLE_F3 historyToBoxCenter = abs(UPSAMPLE_F3(fHistory.xyz) - boxCenter); - UPSAMPLE_F3 HistoryColorWeight; - HistoryColorWeight.x = historyToBoxCenter.x > UPSAMPLE_F(0) ? clippedHistoryToBoxCenter.x / historyToBoxCenter.x : UPSAMPLE_F(0.0f); - HistoryColorWeight.y = historyToBoxCenter.y > UPSAMPLE_F(0) ? clippedHistoryToBoxCenter.y / historyToBoxCenter.y : UPSAMPLE_F(0.0f); - HistoryColorWeight.z = historyToBoxCenter.z > UPSAMPLE_F(0) ? clippedHistoryToBoxCenter.z / historyToBoxCenter.z : UPSAMPLE_F(0.0f); + FfxFloat32x3 clippedHistoryToBoxCenter = abs(clampedHistorySample - boxCenter); + FfxFloat32x3 historyToBoxCenter = abs(fHistory.xyz - boxCenter); + FfxFloat32x3 HistoryColorWeight; + HistoryColorWeight.x = historyToBoxCenter.x > FfxFloat32(0) ? clippedHistoryToBoxCenter.x / historyToBoxCenter.x : FfxFloat32(0.0f); + HistoryColorWeight.y = historyToBoxCenter.y > FfxFloat32(0) ? clippedHistoryToBoxCenter.y / historyToBoxCenter.y : FfxFloat32(0.0f); + HistoryColorWeight.z = historyToBoxCenter.z > FfxFloat32(0) ? 
clippedHistoryToBoxCenter.z / historyToBoxCenter.z : FfxFloat32(0.0f); - UPSAMPLE_F3 fHistoryContribution = HistoryColorWeight; + FfxFloat32x3 fHistoryContribution = HistoryColorWeight; // only lock luma - fHistoryContribution += UPSAMPLE_F3_BROADCAST(ffxMax(UPSAMPLE_F(fLockContributionThisFrame), fLumaStabilityFactor)); + fHistoryContribution += ffxMax(fLockContributionThisFrame, fLumaStabilityFactor).xxx; fHistoryContribution *= (fDepthClipFactor * fDepthClipFactor); - fHistory.xyz = FfxFloat32x3(ffxLerp(clampedHistorySample.xyz, fHistory.xyz, ffxSaturate(fHistoryContribution))); + fHistory.xyz = ffxLerp(clampedHistorySample.xyz, fHistory.xyz, ffxSaturate(fHistoryContribution)); } } -void WriteUpscaledOutput(FFX_MIN16_I2 iPxHrPos, FfxFloat32x3 fUpscaledColor) +void WriteUpscaledOutput(FfxInt32x2 iPxHrPos, FfxFloat32x3 fUpscaledColor) { StoreUpscaledOutput(iPxHrPos, fUpscaledColor); } @@ -122,68 +108,62 @@ FfxFloat32 GetLumaStabilityFactor(FfxFloat32x2 fHrUv, FfxFloat32 fHrVelocity) return fLumaStabilityFactor; } -FfxFloat32 GetLockContributionThisFrame(FfxFloat32x2 fUvCoord, FfxFloat32 fAccumulationMask, FfxFloat32 fParticleMask, LOCK_STATUS_T fLockStatus) +FfxFloat32 GetLockContributionThisFrame(FfxFloat32x2 fUvCoord, FfxFloat32 fAccumulationMask, FfxFloat32 fParticleMask, FfxFloat32x3 fLockStatus) { - const UPSAMPLE_F fNormalizedLockLifetime = GetNormalizedRemainingLockLifetime(fLockStatus); + const FfxFloat32 fNormalizedLockLifetime = GetNormalizedRemainingLockLifetime(fLockStatus); // Rectify on lock frame - FfxFloat32 fLockContributionThisFrame = ffxSaturate(fNormalizedLockLifetime * UPSAMPLE_F(4)); - - fLockContributionThisFrame *= (1.0f - fParticleMask); - //Take down contribution in transparent areas - fLockContributionThisFrame *= FfxFloat32(fAccumulationMask.r > 0.1f); + FfxFloat32 fLockContributionThisFrame = ffxSaturate(fNormalizedLockLifetime * FfxFloat32(4)); return fLockContributionThisFrame; } -void FinalizeLockStatus(FFX_MIN16_I2 iPxHrPos, LOCK_STATUS_T fLockStatus, FfxFloat32 fUpsampledWeight) +void FinalizeLockStatus(FfxInt32x2 iPxHrPos, FfxFloat32x3 fLockStatus, FfxFloat32 fUpsampledWeight) { // Increase trust - const UPSAMPLE_F fTrustIncreaseLanczosMax = UPSAMPLE_F(12); // same increase no matter the MaxAccumulationWeight() value. - const UPSAMPLE_F fTrustIncrease = UPSAMPLE_F(fUpsampledWeight / fTrustIncreaseLanczosMax); - fLockStatus[LOCK_TRUST] = ffxMin(LOCK_STATUS_F1(1), fLockStatus[LOCK_TRUST] + fTrustIncrease); + const FfxFloat32 fTrustIncreaseLanczosMax = FfxFloat32(12); // same increase no matter the MaxAccumulationWeight() value. 
+ const FfxFloat32 fTrustIncrease = FfxFloat32(fUpsampledWeight / fTrustIncreaseLanczosMax); + fLockStatus[LOCK_TRUST] = ffxMin(FfxFloat32(1), fLockStatus[LOCK_TRUST] + fTrustIncrease); // Decrease lock lifetime - const UPSAMPLE_F fLifetimeDecreaseLanczosMax = UPSAMPLE_F(JitterSequenceLength()) * UPSAMPLE_F(averageLanczosWeightPerFrame); - const UPSAMPLE_F fLifetimeDecrease = UPSAMPLE_F(fUpsampledWeight / fLifetimeDecreaseLanczosMax); - fLockStatus[LOCK_LIFETIME_REMAINING] = ffxMax(LOCK_STATUS_F1(0), fLockStatus[LOCK_LIFETIME_REMAINING] - fLifetimeDecrease); + const FfxFloat32 fLifetimeDecreaseLanczosMax = FfxFloat32(JitterSequenceLength()) * FfxFloat32(averageLanczosWeightPerFrame); + const FfxFloat32 fLifetimeDecrease = FfxFloat32(fUpsampledWeight / fLifetimeDecreaseLanczosMax); + fLockStatus[LOCK_LIFETIME_REMAINING] = ffxMax(FfxFloat32(0), fLockStatus[LOCK_LIFETIME_REMAINING] - fLifetimeDecrease); StoreLockStatus(iPxHrPos, fLockStatus); } -UPSAMPLE_F ComputeMaxAccumulationWeight(UPSAMPLE_F fHrVelocity, UPSAMPLE_F fReactiveMax, UPSAMPLE_F fDepthClipFactor, UPSAMPLE_F fLuminanceDiff, LockState lockState) { +FfxFloat32 ComputeMaxAccumulationWeight(FfxFloat32 fHrVelocity, FfxFloat32 fReactiveMax, FfxFloat32 fDepthClipFactor, FfxFloat32 fLuminanceDiff, LockState lockState) { - UPSAMPLE_F normalizedMinimum = UPSAMPLE_F(accumulationMaxOnMotion) / UPSAMPLE_F(MaxAccumulationWeight()); + FfxFloat32 normalizedMinimum = FfxFloat32(accumulationMaxOnMotion) / FfxFloat32(MaxAccumulationWeight()); - UPSAMPLE_F fReactiveMaxAccumulationWeight = UPSAMPLE_F(1) - fReactiveMax; - UPSAMPLE_F fMotionMaxAccumulationWeight = ffxLerp(UPSAMPLE_F(1), normalizedMinimum, ffxSaturate(fHrVelocity * UPSAMPLE_F(10))); - UPSAMPLE_F fDepthClipMaxAccumulationWeight = fDepthClipFactor; + FfxFloat32 fReactiveMaxAccumulationWeight = FfxFloat32(1) - fReactiveMax; + FfxFloat32 fMotionMaxAccumulationWeight = ffxLerp(FfxFloat32(1), normalizedMinimum, ffxSaturate(fHrVelocity * FfxFloat32(10))); + FfxFloat32 fDepthClipMaxAccumulationWeight = fDepthClipFactor; - UPSAMPLE_F fLuminanceDiffMaxAccumulationWeight = ffxSaturate(ffxMax(normalizedMinimum, UPSAMPLE_F(1) - fLuminanceDiff)); + FfxFloat32 fLuminanceDiffMaxAccumulationWeight = ffxSaturate(ffxMax(normalizedMinimum, FfxFloat32(1) - fLuminanceDiff)); - UPSAMPLE_F maxAccumulation = UPSAMPLE_F(MaxAccumulationWeight()) * ffxMin( + FfxFloat32 maxAccumulation = FfxFloat32(MaxAccumulationWeight()) * ffxMin( ffxMin(fReactiveMaxAccumulationWeight, fMotionMaxAccumulationWeight), ffxMin(fDepthClipMaxAccumulationWeight, fLuminanceDiffMaxAccumulationWeight) ); - return (lockState.NewLock && !lockState.WasLockedPrevFrame) ? UPSAMPLE_F(accumulationMaxOnMotion) : maxAccumulation; + return (lockState.NewLock && !lockState.WasLockedPrevFrame) ? 
FfxFloat32(accumulationMaxOnMotion) : maxAccumulation; } -UPSAMPLE_F2 ComputeKernelWeight(in UPSAMPLE_F fHistoryWeight, in UPSAMPLE_F fDepthClipFactor, in UPSAMPLE_F fReactivityFactor) { - UPSAMPLE_F fKernelSizeBias = ffxSaturate(ffxMax(UPSAMPLE_F(0), fHistoryWeight - UPSAMPLE_F(0.5)) / UPSAMPLE_F(3)); +FfxFloat32x2 ComputeKernelWeight(in FfxFloat32 fHistoryWeight, in FfxFloat32 fDepthClipFactor, in FfxFloat32 fReactivityFactor) { + FfxFloat32 fKernelSizeBias = ffxSaturate(ffxMax(FfxFloat32(0), fHistoryWeight - FfxFloat32(0.5)) / FfxFloat32(3)); - //high bias on disocclusions - - UPSAMPLE_F fOneMinusReactiveMax = UPSAMPLE_F(1) - fReactivityFactor; - UPSAMPLE_F2 fKernelWeight = UPSAMPLE_F(1) + (UPSAMPLE_F(1.0f) / UPSAMPLE_F2(DownscaleFactor()) - UPSAMPLE_F(1)) * UPSAMPLE_F(fKernelSizeBias) * fOneMinusReactiveMax; + FfxFloat32 fOneMinusReactiveMax = FfxFloat32(1) - fReactivityFactor; + FfxFloat32x2 fKernelWeight = FfxFloat32(1) + (FfxFloat32(1.0f) / FfxFloat32x2(DownscaleFactor()) - FfxFloat32(1)) * FfxFloat32(fKernelSizeBias) * fOneMinusReactiveMax; //average value on disocclusion, to help decrease high value sample importance wait for accumulation to kick in - fKernelWeight *= FFX_BROADCAST_MIN_FLOAT16X2(UPSAMPLE_F(0.5) + fDepthClipFactor * UPSAMPLE_F(0.5)); + fKernelWeight *= FfxFloat32x2(0.5f, 0.5f) + fDepthClipFactor * FfxFloat32x2(0.5f, 0.5f); - return ffxMin(FFX_BROADCAST_MIN_FLOAT16X2(1.99), fKernelWeight); + return ffxMin(FfxFloat32x2(1.99f, 1.99f), fKernelWeight); } -void Accumulate(FFX_MIN16_I2 iPxHrPos) +void Accumulate(FfxInt32x2 iPxHrPos) { const FfxFloat32x2 fSamplePosHr = iPxHrPos + 0.5f; const FfxFloat32x2 fPxLrPos = fSamplePosHr * DownscaleFactor(); // Source resolution output pixel center position @@ -199,16 +179,13 @@ void Accumulate(FFX_MIN16_I2 iPxHrPos) const FfxFloat32 fHrVelocity = GetPxHrVelocity(fMotionVector); const FfxFloat32 fDepthClipFactor = ffxSaturate(SampleDepthClip(fLrUvJittered)); const FfxFloat32 fLumaStabilityFactor = GetLumaStabilityFactor(fHrUv, fHrVelocity); - const FfxFloat32 fAccumulationMask = 1.0f - TransparencyAndCompositionMaskSample(fLrUvJittered, RenderSize()).r; + const FfxFloat32x2 fDilatedReactiveMasks = SampleDilatedReactiveMasks(fLrUvJittered); + const FfxFloat32 fReactiveMax = fDilatedReactiveMasks.x; + const FfxFloat32 fAccumulationMask = fDilatedReactiveMasks.y; - FfxInt32x2 offsetTL; - offsetTL.x = (fSamplePosUnjitterLr.x > fPxLrPos.x) ? FfxInt32(0) : FfxInt32(1); - offsetTL.y = (fSamplePosUnjitterLr.y > fPxLrPos.y) ? 
FfxInt32(0) : FfxInt32(1); - - const UPSAMPLE_F fReactiveMax = UPSAMPLE_F(1) - Pow3(UPSAMPLE_F(1) - LoadReactiveMax(FFX_MIN16_I2(iPxLrPos + offsetTL))); - - FfxFloat32x4 fHistoryColorAndWeight = FfxFloat32x4(0.0f, 0.0f, 0.0f, 0.0f); - LOCK_STATUS_T fLockStatus = CreateNewLockSample(); + FfxFloat32x4 fHistoryColorAndWeight = FfxFloat32x4(0, 0, 0, 0); + FfxFloat32x3 fLockStatus; + InitializeNewLockSample(fLockStatus); FfxBoolean bIsExistingSample = FFX_TRUE; FfxFloat32x2 fReprojectedHrUv = FfxFloat32x2(0, 0); @@ -219,18 +196,18 @@ void Accumulate(FFX_MIN16_I2 iPxHrPos) ReprojectHistoryLockStatus(iPxHrPos, fReprojectedHrUv, fLockStatus); } - FFX_MIN16_F fLuminanceDiff = FFX_MIN16_F(0.0f); + FfxFloat32 fLuminanceDiff = FfxFloat32(0.0f); - LockState lockState = PostProcessLockStatus(iPxHrPos, fLrUvJittered, FFX_MIN16_F(fDepthClipFactor), fHrVelocity, fHistoryColorAndWeight.w, fLockStatus, fLuminanceDiff); + LockState lockState = PostProcessLockStatus(iPxHrPos, fLrUvJittered, FfxFloat32(fDepthClipFactor), fAccumulationMask, fHrVelocity, fHistoryColorAndWeight.w, fLockStatus, fLuminanceDiff); fHistoryColorAndWeight.w = ffxMin(fHistoryColorAndWeight.w, ComputeMaxAccumulationWeight( - UPSAMPLE_F(fHrVelocity), fReactiveMax, UPSAMPLE_F(fDepthClipFactor), UPSAMPLE_F(fLuminanceDiff), lockState + FfxFloat32(fHrVelocity), fReactiveMax, FfxFloat32(fDepthClipFactor), FfxFloat32(fLuminanceDiff), lockState )); - const UPSAMPLE_F fNormalizedLockLifetime = GetNormalizedRemainingLockLifetime(fLockStatus); + const FfxFloat32 fNormalizedLockLifetime = GetNormalizedRemainingLockLifetime(fLockStatus); // Kill accumulation based on shading change - fHistoryColorAndWeight.w = ffxMin(fHistoryColorAndWeight.w, FFX_MIN16_F(ffxMax(0.0f, MaxAccumulationWeight() * ffxPow(UPSAMPLE_F(1) - fLuminanceDiff, 2.0f / 1.0f)))); + fHistoryColorAndWeight.w = ffxMin(fHistoryColorAndWeight.w, FfxFloat32(ffxMax(0.0f, MaxAccumulationWeight() * ffxPow(FfxFloat32(1) - fLuminanceDiff, 2.0f / 1.0f)))); // Load upsampled input color RectificationBoxData clippingBox; @@ -240,20 +217,24 @@ void Accumulate(FFX_MIN16_I2 iPxHrPos) FfxFloat32 fReactiveWeighted = 0; // No trust in reactive areas - fLockStatus[LOCK_TRUST] = ffxMin(fLockStatus[LOCK_TRUST], LOCK_STATUS_F1(1.0f) - LOCK_STATUS_F1(pow(fReactiveMax, 1.0f / 3.0f))); - fLockStatus[LOCK_TRUST] = ffxMin(fLockStatus[LOCK_TRUST], LOCK_STATUS_F1(fDepthClipFactor)); + fLockStatus[LOCK_TRUST] = ffxMin(fLockStatus[LOCK_TRUST], FfxFloat32(1.0f) - FfxFloat32(pow(fReactiveMax, 1.0f / 3.0f))); + fLockStatus[LOCK_TRUST] = ffxMin(fLockStatus[LOCK_TRUST], FfxFloat32(fDepthClipFactor)); - UPSAMPLE_F2 fKernelWeight = ComputeKernelWeight(UPSAMPLE_F(fHistoryColorAndWeight.w), UPSAMPLE_F(fDepthClipFactor), ffxMax((UPSAMPLE_F(1) - fLockStatus[LOCK_TRUST]), fReactiveMax)); + FfxFloat32x2 fKernelWeight = ComputeKernelWeight(fHistoryColorAndWeight.w, FfxFloat32(fDepthClipFactor), ffxMax((FfxFloat32(1) - fLockStatus[LOCK_TRUST]), fReactiveMax)); - UPSAMPLE_F4 fUpsampledColorAndWeight = ComputeUpsampledColorAndWeight(iPxHrPos, fKernelWeight, clippingBox); + FfxFloat32x4 fUpsampledColorAndWeight = ComputeUpsampledColorAndWeight(iPxHrPos, fKernelWeight, clippingBox); +#if FFX_FSR2_OPTION_GUARANTEE_UPSAMPLE_WEIGHT_ON_NEW_SAMPLES + // Make sure all samples have same weight on reset/first frame. Upsampled weight should never be 0.0f when history accumulation is 0.0f. + fUpsampledColorAndWeight.w = (fHistoryColorAndWeight.w == 0.0f) ? 
ffxMax(FSR2_EPSILON, fUpsampledColorAndWeight.w) : fUpsampledColorAndWeight.w; +#endif FfxFloat32 fLockContributionThisFrame = GetLockContributionThisFrame(fHrUv, fAccumulationMask, fReactiveMax, fLockStatus); // Update accumulation and rectify history - if (fHistoryColorAndWeight.w > 0.0f) { + if (fHistoryColorAndWeight.w > FfxFloat32(0)) { - RectifyHistory(clippingBox, fHistoryColorAndWeight, fLockStatus, UPSAMPLE_F(fDepthClipFactor), UPSAMPLE_F(fLumaStabilityFactor), UPSAMPLE_F(fLuminanceDiff), fUpsampledColorAndWeight.w, fLockContributionThisFrame); + RectifyHistory(clippingBox, fHistoryColorAndWeight, fLockStatus, FfxFloat32(fDepthClipFactor), FfxFloat32(fLumaStabilityFactor), FfxFloat32(fLuminanceDiff), fUpsampledColorAndWeight.w, fLockContributionThisFrame); fHistoryColorAndWeight.rgb = YCoCgToRGB(fHistoryColorAndWeight.rgb); } @@ -261,12 +242,12 @@ void Accumulate(FFX_MIN16_I2 iPxHrPos) Accumulate(iPxHrPos, fHistoryColorAndWeight, fUpsampledColorAndWeight, fDepthClipFactor, fHrVelocity); //Subtract accumulation weight in reactive areas - fHistoryColorAndWeight.w -= FfxFloat32(fUpsampledColorAndWeight.w * fReactiveMax); + fHistoryColorAndWeight.w -= fUpsampledColorAndWeight.w * fReactiveMax; #if FFX_FSR2_OPTION_HDR_COLOR_INPUT fHistoryColorAndWeight.rgb = InverseTonemap(fHistoryColorAndWeight.rgb); #endif - fHistoryColorAndWeight.rgb /= Exposure(); + fHistoryColorAndWeight.rgb /= FfxFloat32(Exposure()); FinalizeLockStatus(iPxHrPos, fLockStatus, fUpsampledColorAndWeight.w); @@ -277,3 +258,5 @@ void Accumulate(FFX_MIN16_I2 iPxHrPos) WriteUpscaledOutput(iPxHrPos, fHistoryColorAndWeight.rgb); #endif } + +#endif // FFX_FSR2_ACCUMULATE_H diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate_pass.glsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate_pass.glsl index bf85d1d..e1ee116 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate_pass.glsl +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate_pass.glsl @@ -44,8 +44,12 @@ #extension GL_EXT_samplerless_texture_functions : require #define FSR2_BIND_SRV_EXPOSURE 0 -#define FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK 1 +#define FSR2_BIND_SRV_DILATED_REACTIVE_MASKS 1 +#if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS #define FSR2_BIND_SRV_DILATED_MOTION_VECTORS 2 +#else +#define FSR2_BIND_SRV_MOTION_VECTORS 2 +#endif #define FSR2_BIND_SRV_INTERNAL_UPSCALED 3 #define FSR2_BIND_SRV_LOCK_STATUS 4 #define FSR2_BIND_SRV_DEPTH_CLIP 5 @@ -53,13 +57,12 @@ #define FSR2_BIND_SRV_LUMA_HISTORY 7 #define FSR2_BIND_SRV_LANCZOS_LUT 8 #define FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT 9 -#define FSR2_BIND_SRV_REACTIVE_MAX 10 -#define FSR2_BIND_SRV_EXPOSURE_MIPS 11 -#define FSR2_BIND_UAV_INTERNAL_UPSCALED 12 -#define FSR2_BIND_UAV_LOCK_STATUS 13 -#define FSR2_BIND_UAV_UPSCALED_OUTPUT 14 +#define FSR2_BIND_SRV_EXPOSURE_MIPS 10 +#define FSR2_BIND_UAV_INTERNAL_UPSCALED 11 +#define FSR2_BIND_UAV_LOCK_STATUS 12 +#define FSR2_BIND_UAV_UPSCALED_OUTPUT 13 -#define FSR2_BIND_CB_FSR2 15 +#define FSR2_BIND_CB_FSR2 14 #include "ffx_fsr2_callbacks_glsl.h" #include "ffx_fsr2_common.h" @@ -92,5 +95,5 @@ void main() uvec2 uDispatchThreadId = uGroupId * uvec2(FFX_FSR2_THREAD_GROUP_WIDTH, FFX_FSR2_THREAD_GROUP_HEIGHT) + gl_LocalInvocationID.xy; - Accumulate(FFX_MIN16_I2(uDispatchThreadId)); -} + Accumulate(ivec2(uDispatchThreadId)); +} \ No newline at end of file diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate_pass.hlsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate_pass.hlsl index d66b075..4321f99 100644 --- 
a/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate_pass.hlsl +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_accumulate_pass.hlsl @@ -30,7 +30,7 @@ // SRV 14 : FSR2_LumaHistory : r_luma_history // SRV 16 : FSR2_LanczosLutData : r_lanczos_lut // SRV 26 : FSR2_MaximumUpsampleBias : r_upsample_maximum_bias_lut -// SRV 27 : FSR2_ReactiveMaskMax : r_reactive_max +// SRV 27 : FSR2_DilatedReactiveMasks : r_dilated_reactive_masks // SRV 28 : FSR2_ExposureMips : r_imgMips // UAV 10 : FSR2_InternalUpscaled1 : rw_internal_upscaled_color // UAV 11 : FSR2_LockStatus1 : rw_lock_status @@ -39,8 +39,11 @@ // CB 1 : FSR2DispatchOffsets #define FSR2_BIND_SRV_EXPOSURE 0 -#define FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK 1 +#if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS #define FSR2_BIND_SRV_DILATED_MOTION_VECTORS 2 +#else +#define FSR2_BIND_SRV_MOTION_VECTORS 2 +#endif #define FSR2_BIND_SRV_INTERNAL_UPSCALED 3 #define FSR2_BIND_SRV_LOCK_STATUS 4 #define FSR2_BIND_SRV_DEPTH_CLIP 5 @@ -48,7 +51,7 @@ #define FSR2_BIND_SRV_LUMA_HISTORY 7 #define FSR2_BIND_SRV_LANCZOS_LUT 8 #define FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT 9 -#define FSR2_BIND_SRV_REACTIVE_MAX 10 +#define FSR2_BIND_SRV_DILATED_REACTIVE_MASKS 10 #define FSR2_BIND_SRV_EXPOSURE_MIPS 11 #define FSR2_BIND_UAV_INTERNAL_UPSCALED 0 #define FSR2_BIND_UAV_LOCK_STATUS 1 @@ -86,5 +89,5 @@ void CS(uint2 uGroupId : SV_GroupID, uint2 uGroupThreadId : SV_GroupThreadID) uint2 uDispatchThreadId = uGroupId * uint2(FFX_FSR2_THREAD_GROUP_WIDTH, FFX_FSR2_THREAD_GROUP_HEIGHT) + uGroupThreadId; - Accumulate(min16int2(uDispatchThreadId)); + Accumulate(uDispatchThreadId); } diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_autogen_reactive_pass.glsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_autogen_reactive_pass.glsl index bef70f8..b509eb0 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_autogen_reactive_pass.glsl +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_autogen_reactive_pass.glsl @@ -28,14 +28,14 @@ #define FSR2_BIND_SRV_POST_ALPHA_COLOR 1 #define FSR2_BIND_UAV_REACTIVE 2 #define FSR2_BIND_CB_REACTIVE 3 -#define FSR2_BIND_CB_FSR2 4 +#define FSR2_BIND_CB_FSR2 4 #include "ffx_fsr2_callbacks_glsl.h" #include "ffx_fsr2_common.h" -layout (set = 1, binding = FSR2_BIND_SRV_PRE_ALPHA_COLOR) uniform texture2D r_input_color_pre_alpha; -layout (set = 1, binding = FSR2_BIND_SRV_POST_ALPHA_COLOR) uniform texture2D r_input_color_post_alpha; -layout (set = 1, binding = FSR2_BIND_UAV_REACTIVE, r8) uniform image2D rw_output_reactive_mask; +layout (set = 1, binding = FSR2_BIND_SRV_PRE_ALPHA_COLOR) uniform texture2D r_input_color_pre_alpha; +layout (set = 1, binding = FSR2_BIND_SRV_POST_ALPHA_COLOR) uniform texture2D r_input_color_post_alpha; +layout (set = 1, binding = FSR2_BIND_UAV_REACTIVE, r8) uniform image2D rw_output_reactive_mask; #ifndef FFX_FSR2_THREAD_GROUP_WIDTH @@ -55,8 +55,8 @@ layout (set = 1, binding = FSR2_BIND_CB_REACTIVE, std140) uniform cbGenerateReac { float scale; float threshold; + float binaryValue; uint flags; - float _padding_; } cbGenerateReactive; FFX_FSR2_NUM_THREADS @@ -85,7 +85,7 @@ void main() out_reactive_value = ((cbGenerateReactive.flags & FFX_FSR2_AUTOREACTIVEFLAGS_USE_COMPONENTS_MAX)!=0) ? max(delta.x, max(delta.y, delta.z)) : length(delta); out_reactive_value *= cbGenerateReactive.scale; - out_reactive_value = ((cbGenerateReactive.flags & FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_THRESHOLD)!=0) ? ((out_reactive_value < cbGenerateReactive.threshold) ? 0 : 1) : out_reactive_value; + out_reactive_value = ((cbGenerateReactive.flags & FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_THRESHOLD)!=0) ? 
((out_reactive_value < cbGenerateReactive.threshold) ? 0 : cbGenerateReactive.binaryValue) : out_reactive_value; imageStore(rw_output_reactive_mask, FfxInt32x2(uDispatchThreadId), vec4(out_reactive_value)); } diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_autogen_reactive_pass.hlsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_autogen_reactive_pass.hlsl index 0528cd0..903ceae 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_autogen_reactive_pass.hlsl +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_autogen_reactive_pass.hlsl @@ -48,8 +48,8 @@ cbuffer cbGenerateReactive : register(b0) { float scale; float threshold; + float binaryValue; uint flags; - float _padding_; }; FFX_FSR2_NUM_THREADS @@ -79,7 +79,7 @@ void CS(uint2 uGroupId : SV_GroupID, uint2 uGroupThreadId : SV_GroupThreadID) out_reactive_value = (flags & FFX_FSR2_AUTOREACTIVEFLAGS_USE_COMPONENTS_MAX) ? max(delta.x, max(delta.y, delta.z)) : length(delta); out_reactive_value *= scale; - out_reactive_value = (flags & FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_THRESHOLD) ? (out_reactive_value < threshold ? 0 : 1) : out_reactive_value; + out_reactive_value = (flags & FFX_FSR2_AUTOREACTIVEFLAGS_APPLY_THRESHOLD) ? (out_reactive_value < threshold ? 0 : binaryValue) : out_reactive_value; rw_output_reactive_mask[uDispatchThreadId] = out_reactive_value; } diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_callbacks_glsl.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_callbacks_glsl.h index e92e680..2cd1d15 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_callbacks_glsl.h +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_callbacks_glsl.h @@ -18,7 +18,6 @@ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. - #include "ffx_fsr2_resources.h" #if defined(FFX_GPU) @@ -33,11 +32,11 @@ #if defined(FSR2_BIND_CB_FSR2) layout (set = 1, binding = FSR2_BIND_CB_FSR2, std140) uniform cbFSR2_t { - FfxInt32x2 iRenderSize; - FfxInt32x2 iDisplaySize; - FfxUInt32x2 uLumaMipDimensions; - FfxUInt32 uLumaMipLevelToUse; - FfxUInt32 uFrameIndex; + FfxInt32x2 iRenderSize; + FfxInt32x2 iDisplaySize; + FfxInt32x2 uLumaMipDimensions; + FfxInt32 uLumaMipLevelToUse; + FfxInt32 uFrameIndex; FfxFloat32x2 fDisplaySizeRcp; FfxFloat32x2 fJitter; FfxFloat32x4 fDeviceToViewDepth; @@ -46,15 +45,15 @@ FfxFloat32x2 reactive_mask_dim_rcp; FfxFloat32x2 MotionVectorScale; FfxFloat32x2 fDownscaleFactor; - FfxFloat32 fPreExposure; - FfxFloat32 fTanHalfFOV; + FfxFloat32 fPreExposure; + FfxFloat32 fTanHalfFOV; FfxFloat32x2 fMotionVectorJitterCancellation; - FfxFloat32 fJitterSequenceLength; - FfxFloat32 fLockInitialLifetime; - FfxFloat32 fLockTickDelta; - FfxFloat32 fDeltaTime; - FfxFloat32 fDynamicResChangeFactor; - FfxFloat32 fLumaMipRcp; + FfxFloat32 fJitterSequenceLength; + FfxFloat32 fLockInitialLifetime; + FfxFloat32 fLockTickDelta; + FfxFloat32 fDeltaTime; + FfxFloat32 fDynamicResChangeFactor; + FfxFloat32 fLumaMipRcp; } cbFSR2; #endif @@ -63,12 +62,12 @@ FfxFloat32 LumaMipRcp() return cbFSR2.fLumaMipRcp; } -FfxUInt32x2 LumaMipDimensions() +FfxInt32x2 LumaMipDimensions() { return cbFSR2.uLumaMipDimensions; } -FfxUInt32 LumaMipLevelToUse() +FfxInt32 LumaMipLevelToUse() { return cbFSR2.uLumaMipLevelToUse; } @@ -135,7 +134,7 @@ FfxFloat32 DynamicResChangeFactor() return cbFSR2.fDynamicResChangeFactor; } -FfxUInt32 FrameIndex() +FfxInt32 FrameIndex() { return cbFSR2.uFrameIndex; } @@ -143,121 +142,110 @@ FfxUInt32 FrameIndex() layout (set = 0, binding = 0) uniform sampler s_PointClamp; layout (set = 0, binding = 1) 
uniform sampler s_LinearClamp; -#define PREPARED_INPUT_COLOR_T FFX_MIN16_F4 -#define PREPARED_INPUT_COLOR_F3 FFX_MIN16_F3 -#define PREPARED_INPUT_COLOR_F1 FFX_MIN16_F - -#define UPSAMPLED_COLOR_T FfxFloat32x3 - -#define RW_UPSAMPLED_WEIGHT_T FfxFloat32 - -#define LOCK_STATUS_T FFX_MIN16_F3 -#define LOCK_STATUS_F1 FFX_MIN16_F - // SRVs #if defined(FSR2_BIND_SRV_INPUT_COLOR) - layout (set = 1, binding = FSR2_BIND_SRV_INPUT_COLOR) uniform texture2D r_input_color_jittered; + layout (set = 1, binding = FSR2_BIND_SRV_INPUT_COLOR) uniform texture2D r_input_color_jittered; #endif #if defined(FSR2_BIND_SRV_MOTION_VECTORS) - layout (set = 1, binding = FSR2_BIND_SRV_MOTION_VECTORS) uniform texture2D r_motion_vectors; + layout (set = 1, binding = FSR2_BIND_SRV_MOTION_VECTORS) uniform texture2D r_motion_vectors; #endif #if defined(FSR2_BIND_SRV_DEPTH) - layout (set = 1, binding = FSR2_BIND_SRV_DEPTH) uniform texture2D r_depth; + layout (set = 1, binding = FSR2_BIND_SRV_DEPTH) uniform texture2D r_depth; #endif #if defined(FSR2_BIND_SRV_EXPOSURE) - layout (set = 1, binding = FSR2_BIND_SRV_EXPOSURE) uniform texture2D r_exposure; + layout (set = 1, binding = FSR2_BIND_SRV_EXPOSURE) uniform texture2D r_exposure; #endif #if defined(FSR2_BIND_SRV_REACTIVE_MASK) - layout (set = 1, binding = FSR2_BIND_SRV_REACTIVE_MASK) uniform texture2D r_reactive_mask; + layout (set = 1, binding = FSR2_BIND_SRV_REACTIVE_MASK) uniform texture2D r_reactive_mask; #endif #if defined(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) - layout (set = 1, binding = FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) uniform texture2D r_transparency_and_composition_mask; + layout (set = 1, binding = FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) uniform texture2D r_transparency_and_composition_mask; #endif #if defined(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH) - layout (set = 1, binding = FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH) uniform utexture2D r_ReconstructedPrevNearestDepth; + layout (set = 1, binding = FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH) uniform utexture2D r_reconstructed_previous_nearest_depth; #endif #if defined(FSR2_BIND_SRV_DILATED_MOTION_VECTORS) - layout (set = 1, binding = FSR2_BIND_SRV_DILATED_MOTION_VECTORS) uniform texture2D r_dilated_motion_vectors; + layout (set = 1, binding = FSR2_BIND_SRV_DILATED_MOTION_VECTORS) uniform texture2D r_dilated_motion_vectors; #endif #if defined(FSR2_BIND_SRV_DILATED_DEPTH) - layout (set = 1, binding = FSR2_BIND_SRV_DILATED_DEPTH) uniform texture2D r_dilatedDepth; + layout (set = 1, binding = FSR2_BIND_SRV_DILATED_DEPTH) uniform texture2D r_dilatedDepth; #endif #if defined(FSR2_BIND_SRV_INTERNAL_UPSCALED) - layout (set = 1, binding = FSR2_BIND_SRV_INTERNAL_UPSCALED) uniform texture2D r_internal_upscaled_color; + layout (set = 1, binding = FSR2_BIND_SRV_INTERNAL_UPSCALED) uniform texture2D r_internal_upscaled_color; #endif #if defined(FSR2_BIND_SRV_LOCK_STATUS) - layout (set = 1, binding = FSR2_BIND_SRV_LOCK_STATUS) uniform texture2D r_lock_status; + layout (set = 1, binding = FSR2_BIND_SRV_LOCK_STATUS) uniform texture2D r_lock_status; #endif #if defined(FSR2_BIND_SRV_DEPTH_CLIP) - layout (set = 1, binding = FSR2_BIND_SRV_DEPTH_CLIP) uniform texture2D r_depth_clip; + layout (set = 1, binding = FSR2_BIND_SRV_DEPTH_CLIP) uniform texture2D r_depth_clip; #endif #if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) - layout (set = 1, binding = FSR2_BIND_SRV_PREPARED_INPUT_COLOR) uniform texture2D r_prepared_input_color; + layout (set = 1, binding = 
FSR2_BIND_SRV_PREPARED_INPUT_COLOR) uniform texture2D r_prepared_input_color; #endif #if defined(FSR2_BIND_SRV_LUMA_HISTORY) - layout (set = 1, binding = FSR2_BIND_SRV_LUMA_HISTORY) uniform texture2D r_luma_history; + layout (set = 1, binding = FSR2_BIND_SRV_LUMA_HISTORY) uniform texture2D r_luma_history; #endif #if defined(FSR2_BIND_SRV_RCAS_INPUT) - layout (set = 1, binding = FSR2_BIND_SRV_RCAS_INPUT) uniform texture2D r_rcas_input; + layout (set = 1, binding = FSR2_BIND_SRV_RCAS_INPUT) uniform texture2D r_rcas_input; #endif #if defined(FSR2_BIND_SRV_LANCZOS_LUT) - layout (set = 1, binding = FSR2_BIND_SRV_LANCZOS_LUT) uniform texture2D r_lanczos_lut; + layout (set = 1, binding = FSR2_BIND_SRV_LANCZOS_LUT) uniform texture2D r_lanczos_lut; #endif #if defined(FSR2_BIND_SRV_EXPOSURE_MIPS) - layout (set = 1, binding = FSR2_BIND_SRV_EXPOSURE_MIPS) uniform texture2D r_imgMips; + layout (set = 1, binding = FSR2_BIND_SRV_EXPOSURE_MIPS) uniform texture2D r_imgMips; #endif #if defined(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT) - layout (set = 1, binding = FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT) uniform texture2D r_upsample_maximum_bias_lut; + layout (set = 1, binding = FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT) uniform texture2D r_upsample_maximum_bias_lut; #endif -#if defined(FSR2_BIND_SRV_REACTIVE_MAX) - layout (set = 1, binding = FSR2_BIND_SRV_REACTIVE_MAX) uniform texture2D r_reactive_max; +#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) + layout (set = 1, binding = FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) uniform texture2D r_dilated_reactive_masks; #endif // UAV #if defined FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH - layout (set = 1, binding = FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH, r32ui) uniform uimage2D rw_ReconstructedPrevNearestDepth; + layout (set = 1, binding = FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH, r32ui) uniform uimage2D rw_reconstructed_previous_nearest_depth; #endif #if defined FSR2_BIND_UAV_DILATED_MOTION_VECTORS - layout (set = 1, binding = FSR2_BIND_UAV_DILATED_MOTION_VECTORS, rg32f) uniform image2D rw_dilated_motion_vectors; + layout (set = 1, binding = FSR2_BIND_UAV_DILATED_MOTION_VECTORS, rg32f) uniform image2D rw_dilated_motion_vectors; #endif #if defined FSR2_BIND_UAV_DILATED_DEPTH - layout (set = 1, binding = FSR2_BIND_UAV_DILATED_DEPTH, r32f) uniform image2D rw_dilatedDepth; + layout (set = 1, binding = FSR2_BIND_UAV_DILATED_DEPTH, r32f) uniform image2D rw_dilatedDepth; #endif #if defined FSR2_BIND_UAV_INTERNAL_UPSCALED - layout (set = 1, binding = FSR2_BIND_UAV_INTERNAL_UPSCALED, rgba32f) uniform image2D rw_internal_upscaled_color; + layout (set = 1, binding = FSR2_BIND_UAV_INTERNAL_UPSCALED, rgba32f) uniform image2D rw_internal_upscaled_color; #endif #if defined FSR2_BIND_UAV_LOCK_STATUS - layout (set = 1, binding = FSR2_BIND_UAV_LOCK_STATUS, r11f_g11f_b10f) uniform image2D rw_lock_status; + layout (set = 1, binding = FSR2_BIND_UAV_LOCK_STATUS, r11f_g11f_b10f) uniform image2D rw_lock_status; #endif #if defined FSR2_BIND_UAV_DEPTH_CLIP - layout (set = 1, binding = FSR2_BIND_UAV_DEPTH_CLIP, r32f) uniform image2D rw_depth_clip; + layout (set = 1, binding = FSR2_BIND_UAV_DEPTH_CLIP, r32f) uniform image2D rw_depth_clip; #endif #if defined FSR2_BIND_UAV_PREPARED_INPUT_COLOR - layout (set = 1, binding = FSR2_BIND_UAV_PREPARED_INPUT_COLOR, rgba32f) uniform image2D rw_prepared_input_color; + layout (set = 1, binding = FSR2_BIND_UAV_PREPARED_INPUT_COLOR, rgba32f) uniform image2D rw_prepared_input_color; #endif #if defined FSR2_BIND_UAV_LUMA_HISTORY - layout (set = 
1, binding = FSR2_BIND_UAV_LUMA_HISTORY, rgba32f) uniform image2D rw_luma_history; + layout (set = 1, binding = FSR2_BIND_UAV_LUMA_HISTORY, rgba32f) uniform image2D rw_luma_history; #endif #if defined FSR2_BIND_UAV_UPSCALED_OUTPUT - layout (set = 1, binding = FSR2_BIND_UAV_UPSCALED_OUTPUT, rgba32f) uniform image2D rw_upscaled_output; + layout (set = 1, binding = FSR2_BIND_UAV_UPSCALED_OUTPUT, rgba32f) uniform image2D rw_upscaled_output; #endif #if defined FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE - layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE, r32f) coherent uniform image2D rw_img_mip_shading_change; + layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE, r32f) coherent uniform image2D rw_img_mip_shading_change; #endif #if defined FSR2_BIND_UAV_EXPOSURE_MIP_5 - layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE_MIP_5, r32f) coherent uniform image2D rw_img_mip_5; + layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE_MIP_5, r32f) coherent uniform image2D rw_img_mip_5; #endif -#if defined FSR2_BIND_UAV_REACTIVE_MASK_MAX - layout (set = 1, binding = FSR2_BIND_UAV_REACTIVE_MASK_MAX, r32f) uniform image2D rw_reactive_max; +#if defined FSR2_BIND_UAV_DILATED_REACTIVE_MASKS + layout (set = 1, binding = FSR2_BIND_UAV_DILATED_REACTIVE_MASKS, rg32f) uniform image2D rw_dilated_reactive_masks; #endif #if defined FSR2_BIND_UAV_EXPOSURE - layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE, rg32f) uniform image2D rw_exposure; + layout (set = 1, binding = FSR2_BIND_UAV_EXPOSURE, rg32f) uniform image2D rw_exposure; #endif #if defined FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC - layout (set = 1, binding = FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC, r32ui) coherent uniform uimage2D rw_spd_global_atomic; + layout (set = 1, binding = FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC, r32ui) coherent uniform uimage2D rw_spd_global_atomic; #endif -FfxFloat32 LoadMipLuma(FfxInt32x2 iPxPos, FfxUInt32 mipLevel) +FfxFloat32 LoadMipLuma(FfxInt32x2 iPxPos, FfxInt32 mipLevel) { #if defined(FSR2_BIND_SRV_EXPOSURE_MIPS) return texelFetch(r_imgMips, iPxPos, FfxInt32(mipLevel)).r; @@ -265,17 +253,9 @@ FfxFloat32 LoadMipLuma(FfxInt32x2 iPxPos, FfxUInt32 mipLevel) return 0.f; #endif } -#if FFX_HALF -FfxFloat16 LoadMipLuma(FfxInt16x2 iPxPos, FfxUInt16 mipLevel) -{ -#if defined(FSR2_BIND_SRV_EXPOSURE_MIPS) - return FfxFloat16(texelFetch(r_imgMips, iPxPos, FfxInt32(mipLevel)).r); -#else - return FfxFloat16(0.f); -#endif -} -#endif -FfxFloat32 SampleMipLuma(FfxFloat32x2 fUV, FfxUInt32 mipLevel) + + +FfxFloat32 SampleMipLuma(FfxFloat32x2 fUV, FfxInt32 mipLevel) { #if defined(FSR2_BIND_SRV_EXPOSURE_MIPS) fUV *= cbFSR2.depthclip_uv_scale; @@ -284,18 +264,6 @@ FfxFloat32 SampleMipLuma(FfxFloat32x2 fUV, FfxUInt32 mipLevel) return 0.f; #endif } -#if FFX_HALF -FfxFloat16 SampleMipLuma(FfxFloat16x2 fUV, FfxUInt32 mipLevel) -{ -#if defined(FSR2_BIND_SRV_EXPOSURE_MIPS) - fUV *= FfxFloat16x2(cbFSR2.depthclip_uv_scale); - return FfxFloat16(textureLod(sampler2D(r_imgMips, s_LinearClamp), fUV, FfxFloat32(mipLevel)).r); -#else - return FfxFloat16(0.f); -#endif -} -#endif - // // a 0 0 0 x @@ -324,7 +292,7 @@ FfxFloat32 LoadInputDepth(FfxInt32x2 iPxPos) #endif } -FfxFloat32 LoadReactiveMask(FFX_MIN16_I2 iPxPos) +FfxFloat32 LoadReactiveMask(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_SRV_REACTIVE_MASK) return texelFetch(r_reactive_mask, FfxInt32x2(iPxPos), 0).r; @@ -342,12 +310,22 @@ FfxFloat32x4 GatherReactiveMask(FfxInt32x2 iPxPos) #endif } -FFX_MIN16_F LoadTransparencyAndCompositionMask(FFX_MIN16_I2 iPxPos) +FfxFloat32 
LoadTransparencyAndCompositionMask(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) - return FFX_MIN16_F(texelFetch(r_transparency_and_composition_mask, FfxInt32x2(iPxPos), 0).r); + return texelFetch(r_transparency_and_composition_mask, iPxPos, 0).r; #else - return FFX_MIN16_F(0.f); + return 0.f; +#endif +} + +FfxFloat32 SampleTransparencyAndCompositionMask(FfxFloat32x2 fUV) +{ +#if defined(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) + fUV *= cbFSR2.depthclip_uv_scale; + return textureLod(sampler2D(r_transparency_and_composition_mask, s_LinearClamp), fUV, 0.0f).x; +#else + return 0.f; #endif } @@ -365,48 +343,37 @@ FfxFloat32x3 LoadInputColor(FfxInt32x2 iPxPos) #endif } -FfxFloat32x3 LoadInputColorWithoutPreExposure(FFX_MIN16_I2 iPxPos) +FfxFloat32x3 LoadInputColorWithoutPreExposure(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_SRV_INPUT_COLOR) - return texelFetch(r_input_color_jittered, FfxInt32x2(iPxPos), 0).rgb; + return texelFetch(r_input_color_jittered, iPxPos, 0).rgb; #else return FfxFloat32x3(0.f); #endif } -#if FFX_HALF -FfxFloat16x3 LoadPreparedInputColor(FfxInt16x2 iPxPos) +FfxFloat32x3 LoadPreparedInputColor(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) - return FfxFloat16x3(texelFetch(r_prepared_input_color, FfxInt32x2(iPxPos), 0).rgb); + return texelFetch(r_prepared_input_color, iPxPos, 0).rgb; #else - return FfxFloat16x3(0.f); -#endif -} -#endif // #if FFX_HALF - -FFX_MIN16_F3 LoadPreparedInputColor(FfxInt32x2 iPxPos) -{ -#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) - return FFX_MIN16_F3(texelFetch(r_prepared_input_color, iPxPos, 0).rgb); -#else - return FFX_MIN16_F3(0.f); + return FfxFloat32x3(0.f); #endif } -FFX_MIN16_F LoadPreparedInputColorLuma(FFX_MIN16_I2 iPxPos) +FfxFloat32 LoadPreparedInputColorLuma(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) - return FFX_MIN16_F(texelFetch(r_prepared_input_color, iPxPos, 0).a); + return texelFetch(r_prepared_input_color, iPxPos, 0).a; #else - return FFX_MIN16_F(0.f); + return 0.f; #endif } -FfxFloat32x2 LoadInputMotionVector(FFX_MIN16_I2 iPxDilatedMotionVectorPos) +FfxFloat32x2 LoadInputMotionVector(FfxInt32x2 iPxDilatedMotionVectorPos) { #if defined(FSR2_BIND_SRV_MOTION_VECTORS) - FfxFloat32x2 fSrcMotionVector = texelFetch(r_motion_vectors, FfxInt32x2(iPxDilatedMotionVectorPos), 0).xy; + FfxFloat32x2 fSrcMotionVector = texelFetch(r_motion_vectors, iPxDilatedMotionVectorPos, 0).xy; #else FfxFloat32x2 fSrcMotionVector = FfxFloat32x2(0.f); #endif @@ -420,32 +387,32 @@ FfxFloat32x2 LoadInputMotionVector(FFX_MIN16_I2 iPxDilatedMotionVectorPos) return fUvMotionVector; } -FFX_MIN16_F4 LoadHistory(FfxInt32x2 iPxHistory) +FfxFloat32x4 LoadHistory(FfxInt32x2 iPxHistory) { #if defined(FSR2_BIND_SRV_INTERNAL_UPSCALED) - return FFX_MIN16_F4(texelFetch(r_internal_upscaled_color, iPxHistory, 0)); + return texelFetch(r_internal_upscaled_color, iPxHistory, 0); #else - return FFX_MIN16_F4(0.f); + return FfxFloat32x4(0.0f); #endif } -FfxFloat32x4 LoadRwInternalUpscaledColorAndWeight(FFX_MIN16_I2 iPxPos) +FfxFloat32x4 LoadRwInternalUpscaledColorAndWeight(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) - return imageLoad(rw_internal_upscaled_color, FfxInt32x2(iPxPos)); + return imageLoad(rw_internal_upscaled_color, iPxPos); #else return FfxFloat32x4(0.f); #endif } -void StoreLumaHistory(FFX_MIN16_I2 iPxPos, FfxFloat32x4 fLumaHistory) +void StoreLumaHistory(FfxInt32x2 iPxPos, FfxFloat32x4 fLumaHistory) { #if defined(FSR2_BIND_UAV_LUMA_HISTORY) 
imageStore(rw_luma_history, FfxInt32x2(iPxPos), fLumaHistory); #endif } -FfxFloat32x4 LoadRwLumaHistory(FFX_MIN16_I2 iPxPos) +FfxFloat32x4 LoadRwLumaHistory(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_UAV_LUMA_HISTORY) return imageLoad(rw_luma_history, FfxInt32x2(iPxPos)); @@ -454,7 +421,7 @@ FfxFloat32x4 LoadRwLumaHistory(FFX_MIN16_I2 iPxPos) #endif } -FfxFloat32 LoadLumaStabilityFactor(FFX_MIN16_I2 iPxPos) +FfxFloat32 LoadLumaStabilityFactor(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_SRV_LUMA_HISTORY) return texelFetch(r_luma_history, FfxInt32x2(iPxPos), 0).w; @@ -473,63 +440,63 @@ FfxFloat32 SampleLumaStabilityFactor(FfxFloat32x2 fUV) #endif } -void StoreReprojectedHistory(FFX_MIN16_I2 iPxHistory, FFX_MIN16_F4 fHistory) +void StoreReprojectedHistory(FfxInt32x2 iPxHistory, FfxFloat32x4 fHistory) { #if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) imageStore(rw_internal_upscaled_color, iPxHistory, fHistory); #endif } -void StoreInternalColorAndWeight(FFX_MIN16_I2 iPxPos, FfxFloat32x4 fColorAndWeight) +void StoreInternalColorAndWeight(FfxInt32x2 iPxPos, FfxFloat32x4 fColorAndWeight) { #if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) imageStore(rw_internal_upscaled_color, FfxInt32x2(iPxPos), fColorAndWeight); #endif } -void StoreUpscaledOutput(FFX_MIN16_I2 iPxPos, FfxFloat32x3 fColor) +void StoreUpscaledOutput(FfxInt32x2 iPxPos, FfxFloat32x3 fColor) { #if defined(FSR2_BIND_UAV_UPSCALED_OUTPUT) imageStore(rw_upscaled_output, FfxInt32x2(iPxPos), FfxFloat32x4(fColor * PreExposure(), 1.f)); #endif } -LOCK_STATUS_T LoadLockStatus(FFX_MIN16_I2 iPxPos) +FfxFloat32x3 LoadLockStatus(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_SRV_LOCK_STATUS) - LOCK_STATUS_T fLockStatus = LOCK_STATUS_T(texelFetch(r_lock_status, iPxPos, 0).rgb); + FfxFloat32x3 fLockStatus = texelFetch(r_lock_status, iPxPos, 0).rgb; - fLockStatus[0] -= LOCK_STATUS_F1(LockInitialLifetime() * 2.0f); + fLockStatus[0] -= LockInitialLifetime() * 2.0f; return fLockStatus; #else - return LOCK_STATUS_T(0.f); + return FfxFloat32x3(0.f); #endif } -LOCK_STATUS_T LoadRwLockStatus(FfxInt32x2 iPxPos) +FfxFloat32x3 LoadRwLockStatus(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_UAV_LOCK_STATUS) - LOCK_STATUS_T fLockStatus = LOCK_STATUS_T(imageLoad(rw_lock_status, iPxPos).rgb); + FfxFloat32x3 fLockStatus = imageLoad(rw_lock_status, iPxPos).rgb; - fLockStatus[0] -= LOCK_STATUS_F1(LockInitialLifetime() * 2.0f); + fLockStatus[0] -= LockInitialLifetime() * 2.0f; return fLockStatus; #else - return LOCK_STATUS_T(0.f); + return FfxFloat32x3(0.f); #endif } -void StoreLockStatus(FFX_MIN16_I2 iPxPos, LOCK_STATUS_T fLockstatus) +void StoreLockStatus(FfxInt32x2 iPxPos, FfxFloat32x3 fLockstatus) { #if defined(FSR2_BIND_UAV_LOCK_STATUS) - fLockstatus[0] += LOCK_STATUS_F1(LockInitialLifetime() * 2.0f); + fLockstatus[0] += LockInitialLifetime() * 2.0f; imageStore(rw_lock_status, iPxPos, vec4(fLockstatus, 0.0f)); #endif } -void StorePreparedInputColor(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN PREPARED_INPUT_COLOR_T fTonemapped) +void StorePreparedInputColor(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x4 fTonemapped) { #if defined(FSR2_BIND_UAV_PREPARED_INPUT_COLOR) imageStore(rw_prepared_input_color, iPxPos, fTonemapped); @@ -541,7 +508,7 @@ FfxBoolean IsResponsivePixel(FfxInt32x2 iPxPos) return FFX_FALSE; //not supported in prototype } -FfxFloat32 LoadDepthClip(FFX_MIN16_I2 iPxPos) +FfxFloat32 LoadDepthClip(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_SRV_DEPTH_CLIP) return texelFetch(r_depth_clip, iPxPos, 0).r; @@ -560,19 +527,19 @@ FfxFloat32 
SampleDepthClip(FfxFloat32x2 fUV) #endif } -LOCK_STATUS_T SampleLockStatus(FfxFloat32x2 fUV) +FfxFloat32x3 SampleLockStatus(FfxFloat32x2 fUV) { #if defined(FSR2_BIND_SRV_LOCK_STATUS) fUV *= cbFSR2.postprocessed_lockstatus_uv_scale; - LOCK_STATUS_T fLockStatus = LOCK_STATUS_T(textureLod(sampler2D(r_lock_status, s_LinearClamp), fUV, 0.0f).rgb); - fLockStatus[0] -= LOCK_STATUS_F1(LockInitialLifetime() * 2.0f); + FfxFloat32x3 fLockStatus = textureLod(sampler2D(r_lock_status, s_LinearClamp), fUV, 0.0f).rgb; + fLockStatus[0] -= LockInitialLifetime() * 2.0f; return fLockStatus; #else - return LOCK_STATUS_T(0.f); + return FfxFloat32x3(0.f); #endif } -void StoreDepthClip(FFX_MIN16_I2 iPxPos, FfxFloat32 fClip) +void StoreDepthClip(FfxInt32x2 iPxPos, FfxFloat32 fClip) { #if defined(FSR2_BIND_UAV_DEPTH_CLIP) imageStore(rw_depth_clip, iPxPos, vec4(fClip, 0.0f, 0.0f, 0.0f)); @@ -584,7 +551,7 @@ FfxFloat32 TanHalfFoV() return cbFSR2.fTanHalfFOV; } -FfxFloat32 LoadSceneDepth(FFX_MIN16_I2 iPxInput) +FfxFloat32 LoadSceneDepth(FfxInt32x2 iPxInput) { #if defined(FSR2_BIND_SRV_DEPTH) return texelFetch(r_depth, iPxInput, 0).r; @@ -593,35 +560,35 @@ FfxFloat32 LoadSceneDepth(FFX_MIN16_I2 iPxInput) #endif } -FfxFloat32 LoadReconstructedPrevDepth(FFX_MIN16_I2 iPxPos) +FfxFloat32 LoadReconstructedPrevDepth(FfxInt32x2 iPxPos) { #if defined(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH) - return uintBitsToFloat(texelFetch(r_ReconstructedPrevNearestDepth, iPxPos, 0).r); + return uintBitsToFloat(texelFetch(r_reconstructed_previous_nearest_depth, iPxPos, 0).r); #else return 0.f; #endif } -void StoreReconstructedDepth(FFX_MIN16_I2 iPxSample, FfxFloat32 fDepth) +void StoreReconstructedDepth(FfxInt32x2 iPxSample, FfxFloat32 fDepth) { FfxUInt32 uDepth = floatBitsToUint(fDepth); #if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH) #if FFX_FSR2_OPTION_INVERTED_DEPTH - imageAtomicMax(rw_ReconstructedPrevNearestDepth, iPxSample, uDepth); + imageAtomicMax(rw_reconstructed_previous_nearest_depth, iPxSample, uDepth); #else - imageAtomicMin(rw_ReconstructedPrevNearestDepth, iPxSample, uDepth); // min for standard, max for inverted depth + imageAtomicMin(rw_reconstructed_previous_nearest_depth, iPxSample, uDepth); // min for standard, max for inverted depth #endif #endif } -void SetReconstructedDepth(FFX_MIN16_I2 iPxSample, FfxUInt32 uValue) +void SetReconstructedDepth(FfxInt32x2 iPxSample, FfxUInt32 uValue) { #if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH) - imageStore(rw_ReconstructedPrevNearestDepth, iPxSample, uvec4(uValue, 0, 0, 0)); + imageStore(rw_reconstructed_previous_nearest_depth, iPxSample, uvec4(uValue, 0, 0, 0)); #endif } -void StoreDilatedDepth(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FfxFloat32 fDepth) +void StoreDilatedDepth(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32 fDepth) { #if defined(FSR2_BIND_UAV_DILATED_DEPTH) //FfxUInt32 uDepth = f32tof16(fDepth); @@ -629,14 +596,14 @@ void StoreDilatedDepth(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN Ff #endif } -void StoreDilatedMotionVector(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fMotionVector) +void StoreDilatedMotionVector(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fMotionVector) { #if defined(FSR2_BIND_UAV_DILATED_MOTION_VECTORS) imageStore(rw_dilated_motion_vectors, iPxPos, vec4(fMotionVector, 0.0f, 0.0f)); #endif } -FfxFloat32x2 LoadDilatedMotionVector(FFX_MIN16_I2 iPxInput) +FfxFloat32x2 LoadDilatedMotionVector(FfxInt32x2 iPxInput) { #if 
defined(FSR2_BIND_SRV_DILATED_MOTION_VECTORS) return texelFetch(r_dilated_motion_vectors, iPxInput, 0).rg; @@ -655,7 +622,7 @@ FfxFloat32x2 SampleDilatedMotionVector(FfxFloat32x2 fUV) #endif } -FfxFloat32 LoadDilatedDepth(FFX_MIN16_I2 iPxInput) +FfxFloat32 LoadDilatedDepth(FfxInt32x2 iPxInput) { #if defined(FSR2_BIND_SRV_DILATED_DEPTH) return texelFetch(r_dilatedDepth, iPxInput, 0).r; @@ -688,41 +655,41 @@ FfxFloat32 SampleLanczos2Weight(FfxFloat32 x) #endif } -#if FFX_HALF -FfxFloat16 SampleLanczos2Weight(FfxFloat16 x) -{ -#if defined(FSR2_BIND_SRV_LANCZOS_LUT) - return FfxFloat16(textureLod(sampler2D(r_lanczos_lut, s_LinearClamp), FfxFloat32x2(x / 2.0f, 0.5f), 0.0f).x); -#else - return FfxFloat16(0.f); -#endif -} -#endif - -FFX_MIN16_F SampleUpsampleMaximumBias(FFX_MIN16_F2 uv) +FfxFloat32 SampleUpsampleMaximumBias(FfxFloat32x2 uv) { #if defined(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT) // Stored as a SNORM, so make sure to multiply by 2 to retrieve the actual expected range. - return FFX_MIN16_F(2.0f) * FFX_MIN16_F(textureLod(sampler2D(r_upsample_maximum_bias_lut, s_LinearClamp), abs(uv) * 2.0f, 0.0f).r); + return FfxFloat32(2.0f) * FfxFloat32(textureLod(sampler2D(r_upsample_maximum_bias_lut, s_LinearClamp), abs(uv) * 2.0f, 0.0f).r); #else - return FFX_MIN16_F(0.f); + return FfxFloat32(0.f); #endif } -FFX_MIN16_F LoadReactiveMax(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos) +FfxFloat32x2 SampleDilatedReactiveMasks(FfxFloat32x2 fUV) { -#if defined(FSR2_BIND_SRV_REACTIVE_MAX) - return FFX_MIN16_F(texelFetch(r_reactive_max, iPxPos, 0).r); +#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) + fUV *= cbFSR2.depthclip_uv_scale; // TODO: assuming these are (RenderSize() / MaxRenderSize()) + return textureLod(sampler2D(r_dilated_reactive_masks, s_LinearClamp), fUV, 0.0f).rg; #else - return FFX_MIN16_F(0.f); + return FfxFloat32x2(0.f); #endif } -void StoreReactiveMax(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F fReactiveMax) +FfxFloat32x2 LoadDilatedReactiveMasks(FFX_PARAMETER_IN FfxInt32x2 iPxPos) { -#if defined(FSR2_BIND_UAV_REACTIVE_MASK_MAX) - imageStore(rw_reactive_max, iPxPos, vec4(FfxFloat32(fReactiveMax), 0.0f, 0.0f, 0.0f)); +#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) + return texelFetch(r_dilated_reactive_masks, iPxPos, 0).rg; +#else + return FfxFloat32x2(0.f); #endif } +void StoreDilatedReactiveMasks(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fDilatedReactiveMasks) +{ +#if defined(FSR2_BIND_UAV_DILATED_REACTIVE_MASKS) + imageStore(rw_dilated_reactive_masks, iPxPos, vec4(fDilatedReactiveMasks, 0.0f, 0.0f)); +#endif +} + + #endif // #if defined(FFX_GPU) diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_callbacks_hlsl.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_callbacks_hlsl.h index a04a949..646847e 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_callbacks_hlsl.h +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_callbacks_hlsl.h @@ -51,12 +51,11 @@ #if defined(FSR2_BIND_CB_FSR2) cbuffer cbFSR2 : FFX_FSR2_DECLARE_CB(FSR2_BIND_CB_FSR2) { - - FfxInt32x2 iRenderSize; - FfxInt32x2 iDisplaySize; - FfxUInt32x2 uLumaMipDimensions; - FfxUInt32 uLumaMipLevelToUse; - FfxUInt32 uFrameIndex; + FfxInt32x2 uRenderSize; + FfxInt32x2 uDisplaySize; + FfxInt32x2 uLumaMipDimensions; + FfxInt32 uLumaMipLevelToUse; + FfxUInt32 uFrameIndex; FfxFloat32x2 fDisplaySizeRcp; FfxFloat32x2 fJitter; FfxFloat32x4 fDeviceToViewDepth; @@ -65,23 +64,23 @@ FfxFloat32x2 reactive_mask_dim_rcp; FfxFloat32x2 MotionVectorScale; FfxFloat32x2 fDownscaleFactor; - FfxFloat32 fPreExposure; - FfxFloat32 
fTanHalfFOV; + FfxFloat32 fPreExposure; + FfxFloat32 fTanHalfFOV; FfxFloat32x2 fMotionVectorJitterCancellation; - FfxFloat32 fJitterSequenceLength; - FfxFloat32 fLockInitialLifetime; - FfxFloat32 fLockTickDelta; - FfxFloat32 fDeltaTime; - FfxFloat32 fDynamicResChangeFactor; - FfxFloat32 fLumaMipRcp; + FfxFloat32 fJitterSequenceLength; + FfxFloat32 fLockInitialLifetime; + FfxFloat32 fLockTickDelta; + FfxFloat32 fDeltaTime; + FfxFloat32 fDynamicResChangeFactor; + FfxFloat32 fLumaMipRcp; #define FFX_FSR2_CONSTANT_BUFFER_1_SIZE 36 // Number of 32-bit values. This must be kept in sync with the cbFSR2 size. }; #else #define iRenderSize 0 #define iDisplaySize 0 - #define uLumaMipDimensions 0 - #define uLumaMipLevelToUse 0 - #define uFrameIndex 0 + #define iLumaMipDimensions 0 + #define iLumaMipLevelToUse 0 + #define iFrameIndex 0 #define fDisplaySizeRcp 0 #define fJitter 0 #define fDeviceToViewDepth FfxFloat32x4(0,0,0,0) @@ -153,12 +152,12 @@ FfxFloat32 LumaMipRcp() return fLumaMipRcp; } -uint2 LumaMipDimensions() +FfxInt32x2 LumaMipDimensions() { return uLumaMipDimensions; } -FfxUInt32 LumaMipLevelToUse() +FfxInt32 LumaMipLevelToUse() { return uLumaMipLevelToUse; } @@ -178,14 +177,14 @@ FfxFloat32x2 MotionVectorJitterCancellation() return fMotionVectorJitterCancellation; } -int2 RenderSize() +FfxInt32x2 RenderSize() { - return iRenderSize; + return uRenderSize; } -int2 DisplaySize() +FfxInt32x2 DisplaySize() { - return iDisplaySize; + return uDisplaySize; } FfxFloat32x2 DisplaySizeRcp() @@ -233,198 +232,171 @@ FfxUInt32 FrameIndex() SamplerState s_PointClamp : register(s0); SamplerState s_LinearClamp : register(s1); - -typedef FFX_MIN16_F4 PREPARED_INPUT_COLOR_T; -typedef FFX_MIN16_F3 PREPARED_INPUT_COLOR_F3; -typedef FFX_MIN16_F PREPARED_INPUT_COLOR_F1; - -typedef FfxFloat32x3 UPSAMPLED_COLOR_T; - -#define RW_UPSAMPLED_WEIGHT_T FfxFloat32 - -typedef FFX_MIN16_F3 LOCK_STATUS_T; -typedef FFX_MIN16_F LOCK_STATUS_F1; - // SRVs #if defined(FFX_INTERNAL) - Texture2D r_input_color_jittered : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR); - Texture2D r_motion_vectors : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_MOTION_VECTORS); - Texture2D r_depth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_DEPTH); - Texture2D r_exposure : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE); - Texture2D r_reactive_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK); - Texture2D r_transparency_and_composition_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK); - Texture2D r_ReconstructedPrevNearestDepth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH); - Texture2D r_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS); - Texture2D r_dilatedDepth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH); - Texture2D r_internal_upscaled_color : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR); - Texture2D r_lock_status : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS); - Texture2D r_depth_clip : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DEPTH_CLIP); - Texture2D r_prepared_input_color : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR); - Texture2D r_luma_history : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY); - Texture2D r_rcas_input : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_RCAS_INPUT); - Texture2D 
r_lanczos_lut : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LANCZOS_LUT);
-    Texture2D r_imgMips : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE);
-    Texture2D r_upsample_maximum_bias_lut : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTITIER_UPSAMPLE_MAXIMUM_BIAS_LUT);
-    Texture2D r_reactive_max : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_REACTIVE_MAX);
+    Texture2D r_input_color_jittered : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_COLOR);
+    Texture2D r_motion_vectors : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_MOTION_VECTORS);
+    Texture2D r_depth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_DEPTH);
+    Texture2D r_exposure : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_EXPOSURE);
+    Texture2D r_reactive_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_REACTIVE_MASK);
+    Texture2D r_transparency_and_composition_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INPUT_TRANSPARENCY_AND_COMPOSITION_MASK);
+    Texture2D r_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH);
+    Texture2D r_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS);
+    Texture2D r_dilatedDepth : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH);
+    Texture2D r_internal_upscaled_color : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR);
+    Texture2D r_lock_status : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS);
+    Texture2D r_depth_clip : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DEPTH_CLIP);
+    Texture2D r_prepared_input_color : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR);
+    Texture2D r_luma_history : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY);
+    Texture2D r_rcas_input : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_RCAS_INPUT);
+    Texture2D r_lanczos_lut : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_LANCZOS_LUT);
+    Texture2D r_imgMips : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE);
+    Texture2D r_upsample_maximum_bias_lut : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTITIER_UPSAMPLE_MAXIMUM_BIAS_LUT);
+    Texture2D r_dilated_reactive_masks : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS);
+    Texture2D r_debug_out : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DEBUG_OUTPUT);
 
     // declarations not current form, no accessor functions
-    Texture2D r_transparency_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_TRANSPARENCY_MASK);
-    Texture2D r_bias_current_color_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_BIAS_CURRENT_COLOR_MASK);
-    Texture2D r_gbuffer_albedo : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_ALBEDO);
-    Texture2D r_gbuffer_roughness : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_ROUGHNESS);
-    Texture2D r_gbuffer_metallic : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_METALLIC);
-    Texture2D r_gbuffer_specular : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_SPECULAR);
-    Texture2D r_gbuffer_subsurface : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_SUBSURFACE);
-    Texture2D r_gbuffer_normals : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_NORMALS);
-    Texture2D r_gbuffer_shading_mode_id : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_SHADING_MODE_ID);
-    Texture2D r_gbuffer_material_id : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_MATERIAL_ID);
-    Texture2D r_motion_vectors_3d : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_VELOCITY_3D);
-    Texture2D r_is_particle_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_IS_PARTICLE_MASK);
-    Texture2D r_animated_texture_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_ANIMATED_TEXTURE_MASK);
-    Texture2D r_depth_high_res : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DEPTH_HIGH_RES);
-    Texture2D r_position_view_space : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_POSITION_VIEW_SPACE);
-    Texture2D r_ray_tracing_hit_distance : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_RAY_TRACING_HIT_DISTANCE);
-    Texture2D r_motion_vectors_reflection : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_VELOCITY_REFLECTION);
+    Texture2D r_transparency_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_TRANSPARENCY_MASK);
+    Texture2D r_bias_current_color_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_BIAS_CURRENT_COLOR_MASK);
+    Texture2D r_gbuffer_albedo : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_ALBEDO);
+    Texture2D r_gbuffer_roughness : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_ROUGHNESS);
+    Texture2D r_gbuffer_metallic : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_METALLIC);
+    Texture2D r_gbuffer_specular : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_SPECULAR);
+    Texture2D r_gbuffer_subsurface : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_SUBSURFACE);
+    Texture2D r_gbuffer_normals : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_NORMALS);
+    Texture2D r_gbuffer_shading_mode_id : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_SHADING_MODE_ID);
+    Texture2D r_gbuffer_material_id : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_GBUFFER_MATERIAL_ID);
+    Texture2D r_motion_vectors_3d : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_VELOCITY_3D);
+    Texture2D r_is_particle_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_IS_PARTICLE_MASK);
+    Texture2D r_animated_texture_mask : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_ANIMATED_TEXTURE_MASK);
+    Texture2D r_depth_high_res : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_DEPTH_HIGH_RES);
+    Texture2D r_position_view_space : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_POSITION_VIEW_SPACE);
+    Texture2D r_ray_tracing_hit_distance : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_RAY_TRACING_HIT_DISTANCE);
+    Texture2D r_motion_vectors_reflection : FFX_FSR2_DECLARE_SRV(FFX_FSR2_RESOURCE_IDENTIFIER_VELOCITY_REFLECTION);
 
     // UAV declarations
-    RWTexture2D rw_ReconstructedPrevNearestDepth : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH);
-    RWTexture2D rw_dilated_motion_vectors : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS);
-    RWTexture2D rw_dilatedDepth : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH);
-    RWTexture2D rw_internal_upscaled_color : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR);
-    RWTexture2D rw_lock_status : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS);
-    RWTexture2D rw_depth_clip : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DEPTH_CLIP);
-    RWTexture2D rw_prepared_input_color : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR);
-    RWTexture2D rw_luma_history : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY);
-    RWTexture2D rw_upscaled_output : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_UPSCALED_OUTPUT);
-    //globallycoherent RWTexture2D rw_imgMipmap[13] : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE);
-    globallycoherent RWTexture2D rw_img_mip_shading_change : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_SHADING_CHANGE);
-    globallycoherent RWTexture2D rw_img_mip_5 : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_5);
-    RWTexture2D rw_reactive_max : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_REACTIVE_MAX);
-    RWTexture2D rw_exposure : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_EXPOSURE);
-    globallycoherent RWTexture2D rw_spd_global_atomic : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_SPD_ATOMIC_COUNT);
-    RWTexture2D rw_debug_out : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DEBUG_OUTPUT);
+    RWTexture2D rw_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_RECONSTRUCTED_PREVIOUS_NEAREST_DEPTH);
+    RWTexture2D rw_dilated_motion_vectors : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_MOTION_VECTORS);
+    RWTexture2D rw_dilatedDepth : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_DEPTH);
+    RWTexture2D rw_internal_upscaled_color : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_UPSCALED_COLOR);
+    RWTexture2D rw_lock_status : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_LOCK_STATUS);
+    RWTexture2D rw_depth_clip : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DEPTH_CLIP);
+    RWTexture2D rw_prepared_input_color : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_PREPARED_INPUT_COLOR);
+    RWTexture2D rw_luma_history : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_LUMA_HISTORY);
+    RWTexture2D rw_upscaled_output : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_UPSCALED_OUTPUT);
+
+    globallycoherent RWTexture2D rw_img_mip_shading_change : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_SHADING_CHANGE);
+    globallycoherent RWTexture2D rw_img_mip_5 : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_5);
+    RWTexture2D rw_dilated_reactive_masks : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS);
+    RWTexture2D rw_exposure : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_EXPOSURE);
+    globallycoherent RWTexture2D rw_spd_global_atomic : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_SPD_ATOMIC_COUNT);
+    RWTexture2D rw_debug_out : FFX_FSR2_DECLARE_UAV(FFX_FSR2_RESOURCE_IDENTIFIER_DEBUG_OUTPUT);
 #else // #if defined(FFX_INTERNAL)
 #if defined FSR2_BIND_SRV_INPUT_COLOR
-    Texture2D r_input_color_jittered : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_COLOR);
+    Texture2D r_input_color_jittered : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INPUT_COLOR);
 #endif
 #if defined FSR2_BIND_SRV_MOTION_VECTORS
-    Texture2D r_motion_vectors : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_MOTION_VECTORS);
+    Texture2D r_motion_vectors : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_MOTION_VECTORS);
 #endif
 #if defined FSR2_BIND_SRV_DEPTH
-    Texture2D r_depth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DEPTH);
+    Texture2D r_depth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DEPTH);
 #endif
 #if defined FSR2_BIND_SRV_EXPOSURE
-    Texture2D r_exposure : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_EXPOSURE);
+    Texture2D r_exposure : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_EXPOSURE);
 #endif
 #if defined FSR2_BIND_SRV_REACTIVE_MASK
-    Texture2D r_reactive_mask : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_REACTIVE_MASK);
+    Texture2D r_reactive_mask : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_REACTIVE_MASK);
 #endif
 #if defined FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK
-    Texture2D r_transparency_and_composition_mask : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK);
+    Texture2D r_transparency_and_composition_mask : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK);
 #endif
 #if defined FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH
-    Texture2D r_ReconstructedPrevNearestDepth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH);
+    Texture2D r_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH);
 #endif
 #if defined FSR2_BIND_SRV_DILATED_MOTION_VECTORS
-    Texture2D r_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DILATED_MOTION_VECTORS);
+    Texture2D r_dilated_motion_vectors : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DILATED_MOTION_VECTORS);
 #endif
 #if defined FSR2_BIND_SRV_DILATED_DEPTH
-    Texture2D r_dilatedDepth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DILATED_DEPTH);
+    Texture2D r_dilatedDepth : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DILATED_DEPTH);
 #endif
 #if defined FSR2_BIND_SRV_INTERNAL_UPSCALED
-    Texture2D r_internal_upscaled_color : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INTERNAL_UPSCALED);
+    Texture2D r_internal_upscaled_color : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_INTERNAL_UPSCALED);
 #endif
 #if defined FSR2_BIND_SRV_LOCK_STATUS
-    #if FFX_COMPILE_FOR_SPIRV
-    Texture2D r_lock_status : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LOCK_STATUS);
-    #else
-    Texture2D r_lock_status : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LOCK_STATUS);
-    #endif
+    Texture2D r_lock_status : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LOCK_STATUS);
 #endif
 #if defined FSR2_BIND_SRV_DEPTH_CLIP
-    Texture2D r_depth_clip : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DEPTH_CLIP);
+    Texture2D r_depth_clip : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DEPTH_CLIP);
 #endif
 #if defined FSR2_BIND_SRV_PREPARED_INPUT_COLOR
-    #if FFX_COMPILE_FOR_SPIRV
-    Texture2D r_prepared_input_color : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_PREPARED_INPUT_COLOR);
-    #else
-    Texture2D r_prepared_input_color : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_PREPARED_INPUT_COLOR);
-    #endif
+    Texture2D r_prepared_input_color : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_PREPARED_INPUT_COLOR);
 #endif
 #if defined FSR2_BIND_SRV_LUMA_HISTORY
-    Texture2D r_luma_history : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LUMA_HISTORY);
+    Texture2D r_luma_history : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LUMA_HISTORY);
 #endif
 #if defined FSR2_BIND_SRV_RCAS_INPUT
-    Texture2D r_rcas_input : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_RCAS_INPUT);
+    Texture2D r_rcas_input : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_RCAS_INPUT);
 #endif
 #if defined FSR2_BIND_SRV_LANCZOS_LUT
-    Texture2D r_lanczos_lut : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LANCZOS_LUT);
+    Texture2D r_lanczos_lut : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_LANCZOS_LUT);
 #endif
 #if defined FSR2_BIND_SRV_EXPOSURE_MIPS
-    Texture2D r_imgMips : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_EXPOSURE_MIPS);
+    Texture2D r_imgMips : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_EXPOSURE_MIPS);
 #endif
 #if defined FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT
-    Texture2D r_upsample_maximum_bias_lut : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT);
+    Texture2D r_upsample_maximum_bias_lut : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT);
 #endif
-#if defined FSR2_BIND_SRV_REACTIVE_MAX
-    Texture2D r_reactive_max : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_REACTIVE_MAX);
+#if defined FSR2_BIND_SRV_DILATED_REACTIVE_MASKS
+    Texture2D r_dilated_reactive_masks : FFX_FSR2_DECLARE_SRV(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS);
 #endif
 
 // UAV declarations
 #if defined FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH
-    RWTexture2D rw_ReconstructedPrevNearestDepth : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH);
+    RWTexture2D rw_reconstructed_previous_nearest_depth : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH);
 #endif
 #if defined FSR2_BIND_UAV_DILATED_MOTION_VECTORS
-    RWTexture2D rw_dilated_motion_vectors : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DILATED_MOTION_VECTORS);
+    RWTexture2D rw_dilated_motion_vectors : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DILATED_MOTION_VECTORS);
 #endif
 #if defined FSR2_BIND_UAV_DILATED_DEPTH
-    RWTexture2D rw_dilatedDepth : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DILATED_DEPTH);
+    RWTexture2D rw_dilatedDepth : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DILATED_DEPTH);
 #endif
 #if defined FSR2_BIND_UAV_INTERNAL_UPSCALED
-    RWTexture2D rw_internal_upscaled_color : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_INTERNAL_UPSCALED);
+    RWTexture2D rw_internal_upscaled_color : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_INTERNAL_UPSCALED);
 #endif
 #if defined FSR2_BIND_UAV_LOCK_STATUS
-    #if FFX_COMPILE_FOR_SPIRV
-    RWTexture2D rw_lock_status : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_LOCK_STATUS);
-    #else
-    RWTexture2D rw_lock_status : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_LOCK_STATUS);
-    #endif
+    RWTexture2D rw_lock_status : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_LOCK_STATUS);
 #endif
 #if defined FSR2_BIND_UAV_DEPTH_CLIP
-    RWTexture2D rw_depth_clip : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DEPTH_CLIP);
+    RWTexture2D rw_depth_clip : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DEPTH_CLIP);
 #endif
 #if defined FSR2_BIND_UAV_PREPARED_INPUT_COLOR
-    #if FFX_COMPILE_FOR_SPIRV
-    RWTexture2D rw_prepared_input_color : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_PREPARED_INPUT_COLOR);
-    #else
-    RWTexture2D rw_prepared_input_color : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_PREPARED_INPUT_COLOR);
-    #endif
+    RWTexture2D rw_prepared_input_color : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_PREPARED_INPUT_COLOR);
 #endif
 #if defined FSR2_BIND_UAV_LUMA_HISTORY
-    RWTexture2D rw_luma_history : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_LUMA_HISTORY);
+    RWTexture2D rw_luma_history : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_LUMA_HISTORY);
 #endif
 #if defined FSR2_BIND_UAV_UPSCALED_OUTPUT
-    RWTexture2D rw_upscaled_output : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_UPSCALED_OUTPUT);
+    RWTexture2D rw_upscaled_output : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_UPSCALED_OUTPUT);
 #endif
 #if defined FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE
-    globallycoherent RWTexture2D rw_img_mip_shading_change : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE);
+    globallycoherent RWTexture2D rw_img_mip_shading_change : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE_MIP_LUMA_CHANGE);
 #endif
 #if defined FSR2_BIND_UAV_EXPOSURE_MIP_5
-    globallycoherent RWTexture2D rw_img_mip_5 : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE_MIP_5);
+    globallycoherent RWTexture2D rw_img_mip_5 : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE_MIP_5);
 #endif
-#if defined FSR2_BIND_UAV_REACTIVE_MASK_MAX
-    RWTexture2D rw_reactive_max : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_REACTIVE_MASK_MAX);
+#if defined FSR2_BIND_UAV_DILATED_REACTIVE_MASKS
+    RWTexture2D rw_dilated_reactive_masks : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_DILATED_REACTIVE_MASKS);
 #endif
 #if defined FSR2_BIND_UAV_EXPOSURE
-    RWTexture2D rw_exposure : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE);
+    RWTexture2D rw_exposure : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_EXPOSURE);
 #endif
 #if defined FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC
-    globallycoherent RWTexture2D rw_spd_global_atomic : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC);
+    globallycoherent RWTexture2D rw_spd_global_atomic : FFX_FSR2_DECLARE_UAV(FSR2_BIND_UAV_SPD_GLOBAL_ATOMIC);
 #endif
 
 #endif // #if defined(FFX_INTERNAL)
 
-FfxFloat32 LoadMipLuma(FFX_MIN16_I2 iPxPos, FfxUInt32 mipLevel)
+FfxFloat32 LoadMipLuma(FfxUInt32x2 iPxPos, FfxUInt32 mipLevel)
 {
 #if defined(FSR2_BIND_SRV_EXPOSURE_MIPS) || defined(FFX_INTERNAL)
     return r_imgMips.mips[mipLevel][iPxPos];
@@ -432,16 +404,7 @@ FfxFloat32 LoadMipLuma(FFX_MIN16_I2 iPxPos, FfxUInt32 mipLevel)
     return 0.f;
 #endif
 }
-#if FFX_HALF
-FfxFloat16 LoadMipLuma(FfxInt16x2 iPxPos, FfxUInt16 mipLevel)
-{
-#if defined(FSR2_BIND_SRV_EXPOSURE_MIPS) || defined(FFX_INTERNAL)
-    return r_imgMips.mips[mipLevel][iPxPos];
-#else
-    return 0.f;
-#endif
-}
-#endif
+
 FfxFloat32 SampleMipLuma(FfxFloat32x2 fUV, FfxUInt32 mipLevel)
 {
 #if defined(FSR2_BIND_SRV_EXPOSURE_MIPS) || defined(FFX_INTERNAL)
@@ -452,17 +415,6 @@ FfxFloat32 SampleMipLuma(FfxFloat32x2 fUV, FfxUInt32 mipLevel)
 #endif
 }
 
-#if FFX_HALF
-FfxFloat16 SampleMipLuma(FfxFloat16x2 fUV, FfxUInt32 mipLevel)
-{
-#if defined(FSR2_BIND_SRV_EXPOSURE_MIPS) || defined(FFX_INTERNAL)
-    fUV *= FfxFloat16x2(depthclip_uv_scale);
-    return r_imgMips.SampleLevel(s_LinearClamp, fUV, mipLevel);
-#else
-    return 0.f;
-#endif
-}
-#endif
 
 //
 // a 0 0 0 x
@@ -482,7 +434,7 @@ FfxFloat32 ConvertFromDeviceDepthToViewSpace(FfxFloat32 fDeviceDepth)
     return -fDeviceToViewDepth[2] / (fDeviceDepth * fDeviceToViewDepth[1] - fDeviceToViewDepth[0]);
 }
 
-FfxFloat32 LoadInputDepth(FFX_MIN16_I2 iPxPos)
+FfxFloat32 LoadInputDepth(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_DEPTH) || defined(FFX_INTERNAL)
     return r_depth[iPxPos];
@@ -491,7 +443,7 @@ FfxFloat32 LoadInputDepth(FFX_MIN16_I2 iPxPos)
 #endif
 }
 
-FFX_MIN16_F LoadReactiveMask(FFX_MIN16_I2 iPxPos)
+FfxFloat32 LoadReactiveMask(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_REACTIVE_MASK) || defined(FFX_INTERNAL)
     return r_reactive_mask[iPxPos];
@@ -500,7 +452,7 @@ FFX_MIN16_F LoadReactiveMask(FFX_MIN16_I2 iPxPos)
 #endif
 }
 
-FfxFloat32x4 GatherReactiveMask(int2 iPxPos)
+FfxFloat32x4 GatherReactiveMask(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_REACTIVE_MASK) || defined(FFX_INTERNAL)
     return r_reactive_mask.GatherRed(s_LinearClamp, FfxFloat32x2(iPxPos) * reactive_mask_dim_rcp);
@@ -509,7 +461,7 @@ FfxFloat32x4 GatherReactiveMask(int2 iPxPos)
 #endif
 }
 
-FFX_MIN16_F LoadTransparencyAndCompositionMask(FFX_MIN16_I2 iPxPos)
+FfxFloat32 LoadTransparencyAndCompositionMask(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) || defined(FFX_INTERNAL)
     return r_transparency_and_composition_mask[iPxPos];
@@ -518,12 +470,22 @@ FFX_MIN16_F LoadTransparencyAndCompositionMask(FFX_MIN16_I2 iPxPos)
 #endif
 }
 
+FfxFloat32 SampleTransparencyAndCompositionMask(FfxFloat32x2 fUV)
+{
+#if defined(FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK) || defined(FFX_INTERNAL)
+    fUV *= depthclip_uv_scale;
+    return r_transparency_and_composition_mask.SampleLevel(s_LinearClamp, fUV, 0);
+#else
+    return 0.f;
+#endif
+}
+
 FfxFloat32 PreExposure()
 {
     return fPreExposure;
 }
 
-FfxFloat32x3 LoadInputColor(FFX_MIN16_I2 iPxPos)
+FfxFloat32x3 LoadInputColor(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_INPUT_COLOR) || defined(FFX_INTERNAL)
     return r_input_color_jittered[iPxPos].rgb / PreExposure();
@@ -532,7 +494,7 @@ FfxFloat32x3 LoadInputColor(FFX_MIN16_I2 iPxPos)
 #endif
 }
 
-FfxFloat32x3 LoadInputColorWithoutPreExposure(FFX_MIN16_I2 iPxPos)
+FfxFloat32x3 LoadInputColorWithoutPreExposure(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_INPUT_COLOR) || defined(FFX_INTERNAL)
     return r_input_color_jittered[iPxPos].rgb;
@@ -541,18 +503,7 @@ FfxFloat32x3 LoadInputColorWithoutPreExposure(FFX_MIN16_I2 iPxPos)
 #endif
 }
 
-#if FFX_HALF
-FfxFloat16x3 LoadPreparedInputColor(FfxInt16x2 iPxPos)
-{
-#if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) || defined(FFX_INTERNAL)
-    return r_prepared_input_color[iPxPos].rgb;
-#else
-    return 0.f;
-#endif
-}
-#endif
-
-FFX_MIN16_F3 LoadPreparedInputColor(int2 iPxPos)
+FfxFloat32x3 LoadPreparedInputColor(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) || defined(FFX_INTERNAL)
     return r_prepared_input_color[iPxPos].rgb;
@@ -561,7 +512,7 @@ FFX_MIN16_F3 LoadPreparedInputColor(int2 iPxPos)
 #endif
 }
 
-FFX_MIN16_F LoadPreparedInputColorLuma(FFX_MIN16_I2 iPxPos)
+FfxFloat32 LoadPreparedInputColorLuma(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_PREPARED_INPUT_COLOR) || defined(FFX_INTERNAL)
     return r_prepared_input_color[iPxPos].a;
@@ -570,7 +521,7 @@ FFX_MIN16_F LoadPreparedInputColorLuma(FFX_MIN16_I2 iPxPos)
 #endif
 }
 
-FfxFloat32x2 LoadInputMotionVector(FFX_MIN16_I2 iPxDilatedMotionVectorPos)
+FfxFloat32x2 LoadInputMotionVector(FfxUInt32x2 iPxDilatedMotionVectorPos)
 {
 #if defined(FSR2_BIND_SRV_MOTION_VECTORS) || defined(FFX_INTERNAL)
     FfxFloat32x2 fSrcMotionVector = r_motion_vectors[iPxDilatedMotionVectorPos].xy;
@@ -587,7 +538,7 @@ FfxFloat32x2 LoadInputMotionVector(FFX_MIN16_I2 iPxDilatedMotionVectorPos)
     return fUvMotionVector;
 }
 
-FFX_MIN16_F4 LoadHistory(int2 iPxHistory)
+FfxFloat32x4 LoadHistory(FfxUInt32x2 iPxHistory)
 {
 #if defined(FSR2_BIND_SRV_INTERNAL_UPSCALED) || defined(FFX_INTERNAL)
     return r_internal_upscaled_color[iPxHistory];
@@ -596,7 +547,7 @@ FFX_MIN16_F4 LoadHistory(int2 iPxHistory)
 #endif
 }
 
-FfxFloat32x4 LoadRwInternalUpscaledColorAndWeight(FFX_MIN16_I2 iPxPos)
+FfxFloat32x4 LoadRwInternalUpscaledColorAndWeight(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) || defined(FFX_INTERNAL)
     return rw_internal_upscaled_color[iPxPos];
@@ -605,14 +556,14 @@ FfxFloat32x4 LoadRwInternalUpscaledColorAndWeight(FFX_MIN16_I2 iPxPos)
 #endif
 }
 
-void StoreLumaHistory(FFX_MIN16_I2 iPxPos, FfxFloat32x4 fLumaHistory)
+void StoreLumaHistory(FfxUInt32x2 iPxPos, FfxFloat32x4 fLumaHistory)
 {
 #if defined(FSR2_BIND_UAV_LUMA_HISTORY) || defined(FFX_INTERNAL)
     rw_luma_history[iPxPos] = fLumaHistory;
 #endif
 }
 
-FfxFloat32x4 LoadRwLumaHistory(FFX_MIN16_I2 iPxPos)
+FfxFloat32x4 LoadRwLumaHistory(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_UAV_LUMA_HISTORY) || defined(FFX_INTERNAL)
     return rw_luma_history[iPxPos];
@@ -621,7 +572,7 @@ FfxFloat32x4 LoadRwLumaHistory(FFX_MIN16_I2 iPxPos)
 #endif
 }
 
-FfxFloat32 LoadLumaStabilityFactor(FFX_MIN16_I2 iPxPos)
+FfxFloat32 LoadLumaStabilityFactor(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_LUMA_HISTORY) || defined(FFX_INTERNAL)
     return r_luma_history[iPxPos].w;
@@ -640,21 +591,21 @@ FfxFloat32 SampleLumaStabilityFactor(FfxFloat32x2 fUV)
 #endif
 }
 
-void StoreReprojectedHistory(FFX_MIN16_I2 iPxHistory, FFX_MIN16_F4 fHistory)
+void StoreReprojectedHistory(FfxUInt32x2 iPxHistory, FfxFloat32x4 fHistory)
 {
 #if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) || defined(FFX_INTERNAL)
     rw_internal_upscaled_color[iPxHistory] = fHistory;
 #endif
 }
 
-void StoreInternalColorAndWeight(FFX_MIN16_I2 iPxPos, FfxFloat32x4 fColorAndWeight)
+void StoreInternalColorAndWeight(FfxUInt32x2 iPxPos, FfxFloat32x4 fColorAndWeight)
 {
 #if defined(FSR2_BIND_UAV_INTERNAL_UPSCALED) || defined(FFX_INTERNAL)
     rw_internal_upscaled_color[iPxPos] = fColorAndWeight;
 #endif
 }
 
-void StoreUpscaledOutput(FFX_MIN16_I2 iPxPos, FfxFloat32x3 fColor)
+void StoreUpscaledOutput(FfxUInt32x2 iPxPos, FfxFloat32x3 fColor)
 {
 #if defined(FSR2_BIND_UAV_UPSCALED_OUTPUT) || defined(FFX_INTERNAL)
     rw_upscaled_output[iPxPos] = FfxFloat32x4(fColor * PreExposure(), 1.f);
@@ -663,10 +614,10 @@ void StoreUpscaledOutput(FFX_MIN16_I2 iPxPos, FfxFloat32x3 fColor)
 
 //LOCK_LIFETIME_REMAINING == 0
 //Should make LockInitialLifetime() return a const 1.0f later
-LOCK_STATUS_T LoadLockStatus(FFX_MIN16_I2 iPxPos)
+FfxFloat32x3 LoadLockStatus(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_LOCK_STATUS) || defined(FFX_INTERNAL)
-    LOCK_STATUS_T fLockStatus = r_lock_status[iPxPos];
+    FfxFloat32x3 fLockStatus = r_lock_status[iPxPos];
 
     fLockStatus[0] -= LockInitialLifetime() * 2.0f;
 
     return fLockStatus;
@@ -677,10 +628,10 @@ LOCK_STATUS_T LoadLockStatus(FFX_MIN16_I2 iPxPos)
 }
 
-LOCK_STATUS_T LoadRwLockStatus(int2 iPxPos)
+FfxFloat32x3 LoadRwLockStatus(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_UAV_LOCK_STATUS) || defined(FFX_INTERNAL)
-    LOCK_STATUS_T fLockStatus = rw_lock_status[iPxPos];
+    FfxFloat32x3 fLockStatus = rw_lock_status[iPxPos];
 
     fLockStatus[0] -= LockInitialLifetime() * 2.0f;
 
@@ -690,7 +641,7 @@ LOCK_STATUS_T LoadRwLockStatus(int2 iPxPos)
 #endif
 }
 
-void StoreLockStatus(FFX_MIN16_I2 iPxPos, LOCK_STATUS_T fLockstatus)
+void StoreLockStatus(FfxUInt32x2 iPxPos, FfxFloat32x3 fLockstatus)
 {
 #if defined(FSR2_BIND_UAV_LOCK_STATUS) || defined(FFX_INTERNAL)
     fLockstatus[0] += LockInitialLifetime() * 2.0f;
@@ -699,19 +650,19 @@ void StoreLockStatus(FFX_MIN16_I2 iPxPos, LOCK_STATUS_T fLockstatus)
 #endif
 }
 
-void StorePreparedInputColor(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN PREPARED_INPUT_COLOR_T fTonemapped)
+void StorePreparedInputColor(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x4 fTonemapped)
 {
 #if defined(FSR2_BIND_UAV_PREPARED_INPUT_COLOR) || defined(FFX_INTERNAL)
     rw_prepared_input_color[iPxPos] = fTonemapped;
 #endif
 }
 
-FfxBoolean IsResponsivePixel(FFX_MIN16_I2 iPxPos)
+FfxBoolean IsResponsivePixel(FfxUInt32x2 iPxPos)
 {
     return FFX_FALSE; //not supported in prototype
 }
 
-FfxFloat32 LoadDepthClip(FFX_MIN16_I2 iPxPos)
+FfxFloat32 LoadDepthClip(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_DEPTH_CLIP) || defined(FFX_INTERNAL)
     return r_depth_clip[iPxPos];
@@ -730,11 +681,11 @@ FfxFloat32 SampleDepthClip(FfxFloat32x2 fUV)
 #endif
 }
 
-LOCK_STATUS_T SampleLockStatus(FfxFloat32x2 fUV)
+FfxFloat32x3 SampleLockStatus(FfxFloat32x2 fUV)
 {
 #if defined(FSR2_BIND_SRV_LOCK_STATUS) || defined(FFX_INTERNAL)
     fUV *= postprocessed_lockstatus_uv_scale;
-    LOCK_STATUS_T fLockStatus = r_lock_status.SampleLevel(s_LinearClamp, fUV, 0);
+    FfxFloat32x3 fLockStatus = r_lock_status.SampleLevel(s_LinearClamp, fUV, 0);
     fLockStatus[0] -= LockInitialLifetime() * 2.0f;
     return fLockStatus;
 #else
@@ -742,7 +693,7 @@ LOCK_STATUS_T SampleLockStatus(FfxFloat32x2 fUV)
 #endif
 }
 
-void StoreDepthClip(FFX_MIN16_I2 iPxPos, FfxFloat32 fClip)
+void StoreDepthClip(FfxUInt32x2 iPxPos, FfxFloat32 fClip)
 {
 #if defined(FSR2_BIND_UAV_DEPTH_CLIP) || defined(FFX_INTERNAL)
     rw_depth_clip[iPxPos] = fClip;
@@ -754,7 +705,7 @@ FfxFloat32 TanHalfFoV()
     return fTanHalfFOV;
 }
 
-FfxFloat32 LoadSceneDepth(FFX_MIN16_I2 iPxInput)
+FfxFloat32 LoadSceneDepth(FfxUInt32x2 iPxInput)
 {
 #if defined(FSR2_BIND_SRV_DEPTH) || defined(FFX_INTERNAL)
     return r_depth[iPxInput];
@@ -763,50 +714,49 @@ FfxFloat32 LoadSceneDepth(FFX_MIN16_I2 iPxInput)
 #endif
 }
 
-FfxFloat32 LoadReconstructedPrevDepth(FFX_MIN16_I2 iPxPos)
+FfxFloat32 LoadReconstructedPrevDepth(FfxUInt32x2 iPxPos)
 {
 #if defined(FSR2_BIND_SRV_RECONSTRUCTED_PREV_NEAREST_DEPTH) || defined(FFX_INTERNAL)
-    return asfloat(r_ReconstructedPrevNearestDepth[iPxPos]);
+    return asfloat(r_reconstructed_previous_nearest_depth[iPxPos]);
 #else
     return 0;
 #endif
 }
 
-void StoreReconstructedDepth(FFX_MIN16_I2 iPxSample, FfxFloat32 fDepth)
+void StoreReconstructedDepth(FfxUInt32x2 iPxSample, FfxFloat32 fDepth)
 {
     FfxUInt32 uDepth = asuint(fDepth);
 
 #if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH) || defined(FFX_INTERNAL)
 #if FFX_FSR2_OPTION_INVERTED_DEPTH
-    InterlockedMax(rw_ReconstructedPrevNearestDepth[iPxSample], uDepth);
+    InterlockedMax(rw_reconstructed_previous_nearest_depth[iPxSample], uDepth);
 #else
-    InterlockedMin(rw_ReconstructedPrevNearestDepth[iPxSample], uDepth); // min for standard, max for inverted depth
+    InterlockedMin(rw_reconstructed_previous_nearest_depth[iPxSample], uDepth); // min for standard, max for inverted depth
 #endif
 #endif
 }
 
-void SetReconstructedDepth(FFX_MIN16_I2 iPxSample, const FfxUInt32 uValue)
+void SetReconstructedDepth(FfxUInt32x2 iPxSample, const FfxUInt32 uValue)
 {
 #if defined(FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH) || defined(FFX_INTERNAL)
-    rw_ReconstructedPrevNearestDepth[iPxSample] = uValue;
+    rw_reconstructed_previous_nearest_depth[iPxSample] = uValue;
 #endif
 }
 
-void StoreDilatedDepth(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FfxFloat32 fDepth)
+void StoreDilatedDepth(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32 fDepth)
 {
 #if defined(FSR2_BIND_UAV_DILATED_DEPTH) || defined(FFX_INTERNAL)
-    //FfxUInt32 uDepth = f32tof16(fDepth);
     rw_dilatedDepth[iPxPos] = fDepth;
 #endif
 }
 
-void StoreDilatedMotionVector(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fMotionVector)
+void StoreDilatedMotionVector(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fMotionVector)
 {
 #if defined(FSR2_BIND_UAV_DILATED_MOTION_VECTORS) || defined(FFX_INTERNAL)
     rw_dilated_motion_vectors[iPxPos] = fMotionVector;
 #endif
 }
 
-FfxFloat32x2 LoadDilatedMotionVector(FFX_MIN16_I2 iPxInput)
+FfxFloat32x2 LoadDilatedMotionVector(FfxUInt32x2 iPxInput)
 {
 #if defined(FSR2_BIND_SRV_DILATED_MOTION_VECTORS) || defined(FFX_INTERNAL)
     return r_dilated_motion_vectors[iPxInput].xy;
@@ -825,7 +775,7 @@ FfxFloat32x2 SampleDilatedMotionVector(FfxFloat32x2 fUV)
 #endif
 }
 
-FfxFloat32 LoadDilatedDepth(FFX_MIN16_I2 iPxInput)
+FfxFloat32 LoadDilatedDepth(FfxUInt32x2 iPxInput)
 {
 #if defined(FSR2_BIND_SRV_DILATED_DEPTH) || defined(FFX_INTERNAL)
     return r_dilatedDepth[iPxInput];
@@ -838,7 +788,7 @@ FfxFloat32 Exposure()
 {
     // return 1.0f;
 #if defined(FSR2_BIND_SRV_EXPOSURE) || defined(FFX_INTERNAL)
-    FfxFloat32 exposure = r_exposure[FFX_MIN16_I2(0, 0)].x;
+    FfxFloat32 exposure = r_exposure[FfxUInt32x2(0, 0)].x;
 #else
     FfxFloat32 exposure = 1.f;
 #endif
@@ -859,40 +809,39 @@ FfxFloat32 SampleLanczos2Weight(FfxFloat32 x)
 #endif
 }
 
-#if FFX_HALF
-FfxFloat16 SampleLanczos2Weight(FfxFloat16 x)
-{
-#if defined(FSR2_BIND_SRV_LANCZOS_LUT) || defined(FFX_INTERNAL)
-    return r_lanczos_lut.SampleLevel(s_LinearClamp, FfxFloat16x2(x / 2, 0.5f), 0);
-#else
-    return 0.f;
-#endif
-}
-#endif
-
-FFX_MIN16_F SampleUpsampleMaximumBias(FFX_MIN16_F2 uv)
+FfxFloat32 SampleUpsampleMaximumBias(FfxFloat32x2 uv)
 {
 #if defined(FSR2_BIND_SRV_UPSCALE_MAXIMUM_BIAS_LUT) || defined(FFX_INTERNAL)
     // Stored as a SNORM, so make sure to multiply by 2 to retrieve the actual expected range.
-    return FFX_MIN16_F(2.0) * r_upsample_maximum_bias_lut.SampleLevel(s_LinearClamp, abs(uv) * 2.0, 0);
+    return FfxFloat32(2.0) * r_upsample_maximum_bias_lut.SampleLevel(s_LinearClamp, abs(uv) * 2.0, 0);
 #else
     return 0.f;
 #endif
 }
 
-FFX_MIN16_F LoadReactiveMax(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos)
+FfxFloat32x2 SampleDilatedReactiveMasks(FfxFloat32x2 fUV)
 {
-#if defined(FSR2_BIND_SRV_REACTIVE_MAX) || defined(FFX_INTERNAL)
-    return r_reactive_max[iPxPos];
+#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) || defined(FFX_INTERNAL)
+    fUV *= depthclip_uv_scale;
+    return r_dilated_reactive_masks.SampleLevel(s_LinearClamp, fUV, 0);
+#else
+    return 0.f;
+#endif
+}
+
+FfxFloat32x2 LoadDilatedReactiveMasks(FFX_PARAMETER_IN FfxUInt32x2 iPxPos)
+{
+#if defined(FSR2_BIND_SRV_DILATED_REACTIVE_MASKS) || defined(FFX_INTERNAL)
+    return r_dilated_reactive_masks[iPxPos];
 #else
     return 0.f;
 #endif
 }
 
-void StoreReactiveMax(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_F fReactiveMax)
+void StoreDilatedReactiveMasks(FFX_PARAMETER_IN FfxUInt32x2 iPxPos, FFX_PARAMETER_IN FfxFloat32x2 fDilatedReactiveMasks)
 {
-#if defined(FSR2_BIND_UAV_REACTIVE_MASK_MAX) || defined(FFX_INTERNAL)
-    rw_reactive_max[iPxPos] = fReactiveMax;
+#if defined(FSR2_BIND_UAV_DILATED_REACTIVE_MASKS) || defined(FFX_INTERNAL)
+    rw_dilated_reactive_masks[iPxPos] = fDilatedReactiveMasks;
 #endif
 }
diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_common.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_common.h
index 7be6631..7f6acf2 100644
--- a/src/ffx-fsr2-api/shaders/ffx_fsr2_common.h
+++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_common.h
@@ -58,135 +58,160 @@ struct LockState
     FfxBoolean WasLockedPrevFrame; //Set to identify if the pixel was already locked (relock)
 };
 
-FFX_MIN16_F GetNormalizedRemainingLockLifetime(FFX_MIN16_F3 fLockStatus)
+FfxFloat32 GetNormalizedRemainingLockLifetime(FfxFloat32x3 fLockStatus)
 {
     const FfxFloat32 fTrust = fLockStatus[LOCK_TRUST];
 
-    return FFX_MIN16_F(((ffxSaturate(fLockStatus[LOCK_LIFETIME_REMAINING] - LockInitialLifetime()) / LockInitialLifetime())) * fTrust);
+    return ffxSaturate(fLockStatus[LOCK_LIFETIME_REMAINING] - LockInitialLifetime()) / LockInitialLifetime() * fTrust;
 }
 
-LOCK_STATUS_T CreateNewLockSample()
+#if FFX_HALF
+FFX_MIN16_F GetNormalizedRemainingLockLifetime(FFX_MIN16_F3 fLockStatus)
 {
-    LOCK_STATUS_T fLockStatus = LOCK_STATUS_T(0, 0, 0);
+    const FFX_MIN16_F fTrust = fLockStatus[LOCK_TRUST];
+    const FFX_MIN16_F fInitialLockLifetime = FFX_MIN16_F(LockInitialLifetime());
 
-    fLockStatus[LOCK_TRUST] = LOCK_STATUS_F1(1);
+    return ffxSaturate(fLockStatus[LOCK_LIFETIME_REMAINING] - fInitialLockLifetime) / fInitialLockLifetime * fTrust;
+}
+#endif
 
-    return fLockStatus;
+void InitializeNewLockSample(FFX_PARAMETER_OUT FfxFloat32x3 fLockStatus)
+{
+    fLockStatus = FfxFloat32x3(0, 0, 1); // LOCK_TRUST to 1
 }
 
+#if FFX_HALF
+void InitializeNewLockSample(FFX_PARAMETER_OUT FFX_MIN16_F3 fLockStatus)
+{
+    fLockStatus = FFX_MIN16_F3(0, 0, 1); // LOCK_TRUST to 1
+}
+#endif
+
+
+void KillLock(FFX_PARAMETER_INOUT FfxFloat32x3 fLockStatus)
+{
+    fLockStatus[LOCK_LIFETIME_REMAINING] = 0;
+}
+
+#if FFX_HALF
 void KillLock(FFX_PARAMETER_INOUT FFX_MIN16_F3 fLockStatus)
 {
     fLockStatus[LOCK_LIFETIME_REMAINING] = FFX_MIN16_F(0);
 }
-
-#define SPLIT_LEFT 0
-#define SPLIT_RIGHT 1
-#ifndef SPLIT_SHADER
-#define SPLIT_SHADER SPLIT_RIGHT
 #endif
 
-#if FFX_HALF
-
-#define UPSAMPLE_F FfxFloat16
-#define UPSAMPLE_F2 FfxFloat16x2
-#define UPSAMPLE_F3 FfxFloat16x3
-#define UPSAMPLE_F4 FfxFloat16x4
-#define UPSAMPLE_I FfxInt16
-#define UPSAMPLE_I2 FfxInt16x2
-#define UPSAMPLE_I3 FfxInt16x3
-#define UPSAMPLE_I4 FfxInt16x4
-#define UPSAMPLE_U FfxUInt16
-#define UPSAMPLE_U2 FfxUInt16x2
-#define UPSAMPLE_U3 FfxUInt16x3
-#define UPSAMPLE_U4 FfxUInt16x4
-#define UPSAMPLE_F2_BROADCAST(X) FFX_BROADCAST_MIN_FLOAT16X2(X)
-#define UPSAMPLE_F3_BROADCAST(X) FFX_BROADCAST_MIN_FLOAT16X3(X)
-#define UPSAMPLE_F4_BROADCAST(X) FFX_BROADCAST_MIN_FLOAT16X4(X)
-#define UPSAMPLE_I2_BROADCAST(X) FFX_BROADCAST_MIN_INT16X2(X)
-#define UPSAMPLE_I3_BROADCAST(X) FFX_BROADCAST_MIN_INT16X3(X)
-#define UPSAMPLE_I4_BROADCAST(X) FFX_BROADCAST_MIN_INT16X4(X)
-#define UPSAMPLE_U2_BROADCAST(X) FFX_BROADCAST_MIN_UINT16X2(X)
-#define UPSAMPLE_U3_BROADCAST(X) FFX_BROADCAST_MIN_UINT16X3(X)
-#define UPSAMPLE_U4_BROADCAST(X) FFX_BROADCAST_MIN_UINT16X4(X)
-
-#else //FFX_HALF
-
-#define UPSAMPLE_F FfxFloat32
-#define UPSAMPLE_F2 FfxFloat32x2
-#define UPSAMPLE_F3 FfxFloat32x3
-#define UPSAMPLE_F4 FfxFloat32x4
-#define UPSAMPLE_I FfxInt32
-#define UPSAMPLE_I2 FfxInt32x2
-#define UPSAMPLE_I3 FfxInt32x3
-#define UPSAMPLE_I4 FfxInt32x4
-#define UPSAMPLE_U FfxUInt32
-#define UPSAMPLE_U2 FfxUInt32x2
-#define UPSAMPLE_U3 FfxUInt32x3
-#define UPSAMPLE_U4 FfxUInt32x4
-#define UPSAMPLE_F2_BROADCAST(X) FFX_BROADCAST_FLOAT32X2(X)
-#define UPSAMPLE_F3_BROADCAST(X) FFX_BROADCAST_FLOAT32X3(X)
-#define UPSAMPLE_F4_BROADCAST(X) FFX_BROADCAST_FLOAT32X4(X)
-#define UPSAMPLE_I2_BROADCAST(X) FFX_BROADCAST_INT32X2(X)
-#define UPSAMPLE_I3_BROADCAST(X) FFX_BROADCAST_INT32X3(X)
-#define UPSAMPLE_I4_BROADCAST(X) FFX_BROADCAST_INT32X4(X)
-#define UPSAMPLE_U2_BROADCAST(X) FFX_BROADCAST_UINT32X2(X)
-#define UPSAMPLE_U3_BROADCAST(X) FFX_BROADCAST_UINT32X3(X)
-#define UPSAMPLE_U4_BROADCAST(X) FFX_BROADCAST_UINT32X4(X)
-
-#endif //FFX_HALF
-
 struct RectificationBoxData
 {
-    UPSAMPLE_F3 boxCenter;
-    UPSAMPLE_F3 boxVec;
-    UPSAMPLE_F3 aabbMin;
-    UPSAMPLE_F3 aabbMax;
+    FfxFloat32x3 boxCenter;
+    FfxFloat32x3 boxVec;
+    FfxFloat32x3 aabbMin;
+    FfxFloat32x3 aabbMax;
 };
+#if FFX_HALF
+struct RectificationBoxDataMin16
+{
+    FFX_MIN16_F3 boxCenter;
+    FFX_MIN16_F3 boxVec;
+    FFX_MIN16_F3 aabbMin;
+    FFX_MIN16_F3 aabbMax;
+};
+#endif
 
 struct RectificationBox
 {
     RectificationBoxData data_;
-    UPSAMPLE_F fBoxCenterWeight;
+    FfxFloat32 fBoxCenterWeight;
 };
-
-void RectificationBoxReset(FFX_PARAMETER_INOUT RectificationBox rectificationBox, const UPSAMPLE_F3 initialColorSample)
+#if FFX_HALF
+struct RectificationBoxMin16
 {
-    rectificationBox.fBoxCenterWeight = UPSAMPLE_F(0.0);
+    RectificationBoxDataMin16 data_;
+    FFX_MIN16_F fBoxCenterWeight;
+};
+#endif
 
-    rectificationBox.data_.boxCenter = UPSAMPLE_F3_BROADCAST(0);
-    rectificationBox.data_.boxVec = UPSAMPLE_F3_BROADCAST(0);
+void RectificationBoxReset(FFX_PARAMETER_INOUT RectificationBox rectificationBox, const FfxFloat32x3 initialColorSample)
+{
+    rectificationBox.fBoxCenterWeight = FfxFloat32(0);
+
+    rectificationBox.data_.boxCenter = FfxFloat32x3(0, 0, 0);
+    rectificationBox.data_.boxVec = FfxFloat32x3(0, 0, 0);
     rectificationBox.data_.aabbMin = initialColorSample;
     rectificationBox.data_.aabbMax = initialColorSample;
 }
+#if FFX_HALF
+void RectificationBoxReset(FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox, const FFX_MIN16_F3 initialColorSample)
+{
+    rectificationBox.fBoxCenterWeight = FFX_MIN16_F(0);
 
-void RectificationBoxAddSample(FFX_PARAMETER_INOUT RectificationBox rectificationBox, const UPSAMPLE_F3 colorSample, const UPSAMPLE_F fSampleWeight)
+    rectificationBox.data_.boxCenter = FFX_MIN16_F3(0, 0, 0);
+    rectificationBox.data_.boxVec = FFX_MIN16_F3(0, 0, 0);
+    rectificationBox.data_.aabbMin = initialColorSample;
+    rectificationBox.data_.aabbMax = initialColorSample;
+}
+#endif
+
+void RectificationBoxAddSample(FFX_PARAMETER_INOUT RectificationBox rectificationBox, const FfxFloat32x3 colorSample, const FfxFloat32 fSampleWeight)
 {
     rectificationBox.data_.aabbMin = ffxMin(rectificationBox.data_.aabbMin, colorSample);
     rectificationBox.data_.aabbMax = ffxMax(rectificationBox.data_.aabbMax, colorSample);
-    UPSAMPLE_F3 weightedSample = colorSample * fSampleWeight;
+    FfxFloat32x3 weightedSample = colorSample * fSampleWeight;
     rectificationBox.data_.boxCenter += weightedSample;
     rectificationBox.data_.boxVec += colorSample * weightedSample;
     rectificationBox.fBoxCenterWeight += fSampleWeight;
 }
+#if FFX_HALF
+void RectificationBoxAddSample(FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox, const FFX_MIN16_F3 colorSample, const FFX_MIN16_F fSampleWeight)
+{
+    rectificationBox.data_.aabbMin = ffxMin(rectificationBox.data_.aabbMin, colorSample);
+    rectificationBox.data_.aabbMax = ffxMax(rectificationBox.data_.aabbMax, colorSample);
+    FFX_MIN16_F3 weightedSample = colorSample * fSampleWeight;
+    rectificationBox.data_.boxCenter += weightedSample;
+    rectificationBox.data_.boxVec += colorSample * weightedSample;
+    rectificationBox.fBoxCenterWeight += fSampleWeight;
+}
+#endif
 
 void RectificationBoxComputeVarianceBoxData(FFX_PARAMETER_INOUT RectificationBox rectificationBox)
 {
-    rectificationBox.fBoxCenterWeight = (abs(rectificationBox.fBoxCenterWeight) > UPSAMPLE_F(FSR2_EPSILON) ? rectificationBox.fBoxCenterWeight : UPSAMPLE_F(1.f));
+    rectificationBox.fBoxCenterWeight = (abs(rectificationBox.fBoxCenterWeight) > FfxFloat32(FSR2_EPSILON) ? rectificationBox.fBoxCenterWeight : FfxFloat32(1.f));
     rectificationBox.data_.boxCenter /= rectificationBox.fBoxCenterWeight;
     rectificationBox.data_.boxVec /= rectificationBox.fBoxCenterWeight;
-    UPSAMPLE_F3 stdDev = sqrt(abs(rectificationBox.data_.boxVec - rectificationBox.data_.boxCenter * rectificationBox.data_.boxCenter));
+    FfxFloat32x3 stdDev = sqrt(abs(rectificationBox.data_.boxVec - rectificationBox.data_.boxCenter * rectificationBox.data_.boxCenter));
     rectificationBox.data_.boxVec = stdDev;
 }
+#if FFX_HALF
+void RectificationBoxComputeVarianceBoxData(FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox)
+{
+    rectificationBox.fBoxCenterWeight = (abs(rectificationBox.fBoxCenterWeight) > FFX_MIN16_F(FSR2_EPSILON) ? rectificationBox.fBoxCenterWeight : FFX_MIN16_F(1.f));
+    rectificationBox.data_.boxCenter /= rectificationBox.fBoxCenterWeight;
+    rectificationBox.data_.boxVec /= rectificationBox.fBoxCenterWeight;
+    FFX_MIN16_F3 stdDev = sqrt(abs(rectificationBox.data_.boxVec - rectificationBox.data_.boxCenter * rectificationBox.data_.boxCenter));
+    rectificationBox.data_.boxVec = stdDev;
+}
+#endif
 
 RectificationBoxData RectificationBoxGetData(FFX_PARAMETER_INOUT RectificationBox rectificationBox)
 {
     return rectificationBox.data_;
 }
+#if FFX_HALF
+RectificationBoxDataMin16 RectificationBoxGetData(FFX_PARAMETER_INOUT RectificationBoxMin16 rectificationBox)
+{
+    return rectificationBox.data_;
+}
+#endif
 
 FfxFloat32x3 SafeRcp3(FfxFloat32x3 v)
 {
-    return (all(FFX_NOT_EQUAL(v, FFX_BROADCAST_FLOAT32X3(0)))) ? (FFX_BROADCAST_FLOAT32X3(1.0f) / v) : FFX_BROADCAST_FLOAT32X3(0.0f);
+    return (all(FFX_NOT_EQUAL(v, FfxFloat32x3(0, 0, 0)))) ? (FfxFloat32x3(1, 1, 1) / v) : FfxFloat32x3(0, 0, 0);
 }
+#if FFX_HALF
+FFX_MIN16_F3 SafeRcp3(FFX_MIN16_F3 v)
+{
+    return (all(FFX_NOT_EQUAL(v, FFX_MIN16_F3(0, 0, 0)))) ? (FFX_MIN16_F3(1, 1, 1) / v) : FFX_MIN16_F3(0, 0, 0);
+}
+#endif
 
 FfxFloat32 MinDividedByMax(const FfxFloat32 v0, const FfxFloat32 v1)
 {
@@ -202,49 +227,31 @@ FFX_MIN16_F MinDividedByMax(const FFX_MIN16_F v0, const FFX_MIN16_F v1)
 }
 #endif
 
-FfxFloat32 MaxDividedByMin(const FfxFloat32 v0, const FfxFloat32 v1)
-{
-    const FfxFloat32 m = ffxMin(v0, v1);
-    return m != 0 ? ffxMax(v0, v1) / m : 0;
-}
-
-FFX_MIN16_F3 RGBToYCoCg_16(FFX_MIN16_F3 fRgb)
-{
-    FFX_MIN16_F3 fYCoCg;
-    fYCoCg.x = dot(fRgb.rgb, FFX_MIN16_F3(+0.25f, +0.50f, +0.25f));
-    fYCoCg.y = dot(fRgb.rgb, FFX_MIN16_F3(+0.50f, +0.00f, -0.50f));
-    fYCoCg.z = dot(fRgb.rgb, FFX_MIN16_F3(-0.25f, +0.50f, -0.25f));
-    return fYCoCg;
-}
-
-FFX_MIN16_F3 RGBToYCoCg_V2_16(FFX_MIN16_F3 fRgb)
-{
-    FFX_MIN16_F a = fRgb.g * FFX_MIN16_F(0.5f);
-    FFX_MIN16_F b = (fRgb.r + fRgb.b) * FFX_MIN16_F(0.25f);
-    FFX_MIN16_F3 fYCoCg;
-    fYCoCg.x = a + b;
-    fYCoCg.y = (fRgb.r - fRgb.b) * FFX_MIN16_F(0.5f);
-    fYCoCg.z = a - b;
-    return fYCoCg;
-}
-
 FfxFloat32x3 YCoCgToRGB(FfxFloat32x3 fYCoCg)
 {
     FfxFloat32x3 fRgb;
-    FfxFloat32 tmp = fYCoCg.x - fYCoCg.z / 2.0;
-    fRgb.g = fYCoCg.z + tmp;
-    fRgb.b = tmp - fYCoCg.y / 2.0;
-    fRgb.r = fRgb.b + fYCoCg.y;
+
+    fYCoCg.yz -= FfxFloat32x2(0.5f, 0.5f); // [0,1] -> [-0.5,0.5]
+
+    fRgb = FfxFloat32x3(
+        fYCoCg.x + fYCoCg.y - fYCoCg.z,
+        fYCoCg.x + fYCoCg.z,
+        fYCoCg.x - fYCoCg.y - fYCoCg.z);
+
     return fRgb;
 }
 #if FFX_HALF
 FFX_MIN16_F3 YCoCgToRGB(FFX_MIN16_F3 fYCoCg)
 {
     FFX_MIN16_F3 fRgb;
-    FFX_MIN16_F tmp = fYCoCg.x - fYCoCg.z * FFX_MIN16_F(0.5f);
-    fRgb.g = fYCoCg.z + tmp;
-    fRgb.b = tmp - fYCoCg.y * FFX_MIN16_F(0.5f);
-    fRgb.r = fRgb.b + fYCoCg.y;
+
+    fYCoCg.yz -= FFX_MIN16_F2(0.5f, 0.5f); // [0,1] -> [-0.5,0.5]
+
+    fRgb = FFX_MIN16_F3(
+        fYCoCg.x + fYCoCg.y - fYCoCg.z,
+        fYCoCg.x + fYCoCg.z,
+        fYCoCg.x - fYCoCg.y - fYCoCg.z);
+
     return fRgb;
 }
 #endif
@@ -252,39 +259,42 @@ FFX_MIN16_F3 YCoCgToRGB(FFX_MIN16_F3 fYCoCg)
 FfxFloat32x3 RGBToYCoCg(FfxFloat32x3 fRgb)
 {
     FfxFloat32x3 fYCoCg;
-    fYCoCg.y = fRgb.r - fRgb.b;
-    FfxFloat32 tmp = fRgb.b + fYCoCg.y / 2.0;
-    fYCoCg.z = fRgb.g - tmp;
-    fYCoCg.x = tmp + fYCoCg.z / 2.0;
+
+    fYCoCg = FfxFloat32x3(
+        0.25f * fRgb.r + 0.5f * fRgb.g + 0.25f * fRgb.b,
+        0.5f * fRgb.r - 0.5f * fRgb.b,
+        -0.25f * fRgb.r + 0.5f * fRgb.g - 0.25f * fRgb.b);
+
+    fYCoCg.yz += FfxFloat32x2(0.5f, 0.5f); // [-0.5,0.5] -> [0,1]
+
     return fYCoCg;
 }
 #if FFX_HALF
 FFX_MIN16_F3 RGBToYCoCg(FFX_MIN16_F3 fRgb)
 {
     FFX_MIN16_F3 fYCoCg;
-    fYCoCg.y = fRgb.r - fRgb.b;
-    FFX_MIN16_F tmp = fRgb.b + fYCoCg.y * FFX_MIN16_F(0.5f);
-    fYCoCg.z = fRgb.g - tmp;
-    fYCoCg.x = tmp + fYCoCg.z * FFX_MIN16_F(0.5f);
+
+    fYCoCg = FFX_MIN16_F3(
+        0.25 * fRgb.r + 0.5 * fRgb.g + 0.25 * fRgb.b,
+        0.5 * fRgb.r - 0.5 * fRgb.b,
+        -0.25 * fRgb.r + 0.5 * fRgb.g - 0.25 * fRgb.b);
+
+    fYCoCg.yz += FFX_MIN16_F2(0.5, 0.5); // [-0.5,0.5] -> [0,1]
+
     return fYCoCg;
 }
 #endif
 
-FfxFloat32x3 RGBToYCoCg_V2(FfxFloat32x3 fRgb)
-{
-    FfxFloat32 a = fRgb.g * 0.5f;
-    FfxFloat32 b = (fRgb.r + fRgb.b) * 0.25f;
-    FfxFloat32x3 fYCoCg;
-    fYCoCg.x = a + b;
-    fYCoCg.y = (fRgb.r - fRgb.b) * 0.5f;
-    fYCoCg.z = a - b;
-    return fYCoCg;
-}
-
 FfxFloat32 RGBToLuma(FfxFloat32x3 fLinearRgb)
 {
     return dot(fLinearRgb, FfxFloat32x3(0.2126f, 0.7152f, 0.0722f));
 }
+#if FFX_HALF
+FFX_MIN16_F RGBToLuma(FFX_MIN16_F3 fLinearRgb)
+{
+    return dot(fLinearRgb, FFX_MIN16_F3(0.2126f, 0.7152f, 0.0722f));
+}
+#endif
 
 FfxFloat32 RGBToPerceivedLuma(FfxFloat32x3 fLinearRgb)
 {
@@ -299,6 +309,22 @@ FfxFloat32 RGBToPerceivedLuma(FfxFloat32x3 fLinearRgb)
     return fPercievedLuminance * 0.01f;
 }
 
+#if FFX_HALF
+FFX_MIN16_F RGBToPerceivedLuma(FFX_MIN16_F3 fLinearRgb)
+{
+    FFX_MIN16_F fLuminance = RGBToLuma(fLinearRgb);
+
+    FFX_MIN16_F fPercievedLuminance = FFX_MIN16_F(0);
+    if (fLuminance <= FFX_MIN16_F(216.0f / 24389.0f)) {
+        fPercievedLuminance = fLuminance * FFX_MIN16_F(24389.0f / 27.0f);
+    }
+    else {
+        fPercievedLuminance = ffxPow(fLuminance, FFX_MIN16_F(1.0f / 3.0f)) * FFX_MIN16_F(116.0f) - FFX_MIN16_F(16.0f);
+    }
+
+    return fPercievedLuminance * FFX_MIN16_F(0.01f);
+}
+#endif
 
 FfxFloat32x3 Tonemap(FfxFloat32x3 fRgb)
@@ -321,22 +347,29 @@ FFX_MIN16_F3 InverseTonemap(FFX_MIN16_F3 fRgb)
 {
     return fRgb / ffxMax(FFX_MIN16_F(FSR2_TONEMAP_EPSILON), FFX_MIN16_F(1.f) - ffxMax(fRgb.r, ffxMax(fRgb.g, fRgb.b))).xxx;
 }
-
-FFX_MIN16_I2 ClampLoad(FFX_MIN16_I2 iPxSample, FFX_MIN16_I2 iPxOffset, FFX_MIN16_I2 iTextureSize)
-{
-    return clamp(iPxSample + iPxOffset, FFX_MIN16_I2(0, 0), iTextureSize - FFX_MIN16_I2(1, 1));
-}
 #endif
 
 FfxInt32x2 ClampLoad(FfxInt32x2 iPxSample, FfxInt32x2 iPxOffset, FfxInt32x2 iTextureSize)
 {
     return clamp(iPxSample + iPxOffset, FfxInt32x2(0, 0), iTextureSize - FfxInt32x2(1, 1));
 }
+#if FFX_HALF
+FFX_MIN16_I2 ClampLoad(FFX_MIN16_I2 iPxSample, FFX_MIN16_I2 iPxOffset, FFX_MIN16_I2 iTextureSize)
+{
+    return clamp(iPxSample + iPxOffset, FFX_MIN16_I2(0, 0), iTextureSize - FFX_MIN16_I2(1, 1));
+}
+#endif
 
+FfxBoolean IsOnScreen(FfxInt32x2 pos, FfxInt32x2 size)
+{
+    return all(FFX_GREATER_THAN_EQUAL(pos, FfxInt32x2(0, 0))) && all(FFX_LESS_THAN(pos, size));
+}
+#if FFX_HALF
 FfxBoolean IsOnScreen(FFX_MIN16_I2 pos, FFX_MIN16_I2 size)
 {
-    return all(FFX_GREATER_THAN_EQUAL(pos, FFX_BROADCAST_MIN_FLOAT16X2(0))) && all(FFX_LESS_THAN(pos, size));
+    return all(FFX_GREATER_THAN_EQUAL(pos, FFX_MIN16_I2(0, 0))) && all(FFX_LESS_THAN(pos, size));
 }
+#endif
 
 FfxFloat32 ComputeAutoExposureFromLavg(FfxFloat32 Lavg)
 {
@@ -351,6 +384,39 @@ FfxFloat32 ComputeAutoExposureFromLavg(FfxFloat32 Lavg)
     return 1 / Lmax;
 }
 
+#if FFX_HALF
+FFX_MIN16_F ComputeAutoExposureFromLavg(FFX_MIN16_F Lavg)
+{
+    Lavg = exp(Lavg);
+
+    const FFX_MIN16_F S = FFX_MIN16_F(100.0f); //ISO arithmetic speed
+    const FFX_MIN16_F K = FFX_MIN16_F(12.5f);
+    const FFX_MIN16_F ExposureISO100 = log2((Lavg * S) / K);
+
+    const FFX_MIN16_F q = FFX_MIN16_F(0.65f);
+    const FFX_MIN16_F Lmax = (FFX_MIN16_F(78.0f) / (q * S)) * ffxPow(FFX_MIN16_F(2.0f), ExposureISO100);
+
+    return FFX_MIN16_F(1) / Lmax;
+}
+#endif
+
+FfxInt32x2 ComputeHrPosFromLrPos(FfxInt32x2 iPxLrPos)
+{
+    FfxFloat32x2 fSrcJitteredPos = FfxFloat32x2(iPxLrPos) + 0.5f - Jitter();
+    FfxFloat32x2 fLrPosInHr = (fSrcJitteredPos / RenderSize()) * DisplaySize();
+    FfxFloat32x2 fHrPos = floor(fLrPosInHr) + 0.5f;
+    return FfxInt32x2(fHrPos);
+}
+#if FFX_HALF
+FFX_MIN16_I2 ComputeHrPosFromLrPos(FFX_MIN16_I2 iPxLrPos)
+{
+    FFX_MIN16_F2 fSrcJitteredPos = FFX_MIN16_F2(iPxLrPos) + FFX_MIN16_F(0.5f) - FFX_MIN16_F2(Jitter());
+    FFX_MIN16_F2 fLrPosInHr = (fSrcJitteredPos / FFX_MIN16_F2(RenderSize())) * FFX_MIN16_F2(DisplaySize());
+    FFX_MIN16_F2 fHrPos = floor(fLrPosInHr) + FFX_MIN16_F(0.5);
+    return FFX_MIN16_I2(fHrPos);
+}
+#endif
+
 #endif // #if defined(FFX_GPU)
 
-#endif //!defined(FFX_FSR2_COMMON_H)
\ No newline at end of file
+#endif //!defined(FFX_FSR2_COMMON_H)
diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl
index 49fe0a8..9a6a329 100644
--- a/src/ffx-fsr2-api/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl
+++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_compute_luminance_pyramid_pass.glsl
@@ -168,4 +168,4 @@ FFX_FSR2_NUM_THREADS
 void main()
 {
     ComputeAutoExposure(gl_WorkGroupID.xyz, gl_LocalInvocationIndex);
-}
+}
\ No newline at end of file
diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_compute_luminance_pyramid_pass.hlsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_compute_luminance_pyramid_pass.hlsl
index 8400af2..07a097a 100644
--- a/src/ffx-fsr2-api/shaders/ffx_fsr2_compute_luminance_pyramid_pass.hlsl
+++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_compute_luminance_pyramid_pass.hlsl
@@ -97,7 +97,7 @@ void SPD_SetExposureBuffer(float2 value)
 #endif
 }
 
-float4 SPD_LoadMipmap5(min16int2 iPxPos)
+float4 SPD_LoadMipmap5(int2 iPxPos)
 {
 #if defined(FSR2_BIND_UAV_EXPOSURE_MIP_5) || defined(FFX_INTERNAL)
     return float4(rw_img_mip_5[iPxPos], 0, 0, 0);
@@ -106,7 +106,7 @@ float4 SPD_LoadMipmap5(min16int2 iPxPos)
 #endif
 }
 
-void SPD_SetMipmap(min16int2 iPxPos, int slice, float value)
+void SPD_SetMipmap(int2 iPxPos, int slice, float value)
 {
     switch (slice)
     {
diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip.h
index b44cc59..81db737 100644
--- a/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip.h
+++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip.h
@@ -19,9 +19,12 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
+#ifndef FFX_FSR2_DEPTH_CLIP_H
+#define FFX_FSR2_DEPTH_CLIP_H
+
 FFX_STATIC const FfxFloat32 DepthClipBaseScale = 4.0f;
 
-FfxFloat32 ComputeSampleDepthClip(FFX_MIN16_I2 iPxSamplePos, FfxFloat32 fPreviousDepth, FfxFloat32 fPreviousDepthBilinearWeight, FfxFloat32 fCurrentDepthViewSpace)
+FfxFloat32 ComputeSampleDepthClip(FfxInt32x2 iPxSamplePos, FfxFloat32 fPreviousDepth, FfxFloat32 fPreviousDepthBilinearWeight, FfxFloat32 fCurrentDepthViewSpace)
 {
     FfxFloat32 fPrevNearestDepthViewSpace = abs(ConvertFromDeviceDepthToViewSpace(fPreviousDepth));
@@ -42,13 +45,13 @@ FfxFloat32 ComputeSampleDepthClip(FFX_MIN16_I2 iPxSamplePos, FfxFloat32 fPreviou
     rw_debug_out[iPxSamplePos] = FfxFloat32x4(fCurrentDepthViewSpace, fPrevNearestDepthViewSpace, fDepthDiff, fDepthClipFactor);
 #endif
 
-    return fPreviousDepthBilinearWeight * fDepthClipFactor * DepthClipBaseScale;
+    return fPreviousDepthBilinearWeight * fDepthClipFactor * ffxLerp(1.0f, DepthClipBaseScale, ffxSaturate(fDepthDiff * fDepthDiff));
 }
 
 FfxFloat32 ComputeDepthClip(FfxFloat32x2 fUvSample, FfxFloat32 fCurrentDepthViewSpace)
 {
     FfxFloat32x2 fPxSample = fUvSample * RenderSize() - 0.5f;
-    FFX_MIN16_I2 iPxSample = FFX_MIN16_I2(floor(fPxSample));
+    FfxInt32x2 iPxSample = FfxInt32x2(floor(fPxSample));
     FfxFloat32x2 fPxFrac = ffxFract(fPxSample);
 
     const FfxFloat32 fBilinearWeights[2][2] = {
@@ -66,8 +69,8 @@ FfxFloat32 ComputeDepthClip(FfxFloat32x2 fUvSample, FfxFloat32 fCurrentDepthView
     FfxFloat32 fWeightSum = 0.0f;
     for (FfxInt32 y = 0; y <= 1; ++y) {
         for (FfxInt32 x = 0; x <= 1; ++x) {
-            FFX_MIN16_I2 iSamplePos = iPxSample + FFX_MIN16_I2(x, y);
-            if (IsOnScreen(iSamplePos, FFX_MIN16_I2(RenderSize()))) {
+            FfxInt32x2 iSamplePos = iPxSample + FfxInt32x2(x, y);
+            if (IsOnScreen(iSamplePos, RenderSize())) {
                 FfxFloat32 fBilinearWeight = fBilinearWeights[y][x];
                 if (fBilinearWeight > reconstructedDepthBilinearWeightThreshold) {
                     fDepth += ComputeSampleDepthClip(iSamplePos, LoadReconstructedPrevDepth(iSamplePos), fBilinearWeight, fCurrentDepthViewSpace);
@@ -80,9 +83,9 @@ FfxFloat32 ComputeDepthClip(FfxFloat32x2 fUvSample, FfxFloat32 fCurrentDepthView
     return (fWeightSum > 0) ? fDepth / fWeightSum : DepthClipBaseScale;
 }
 
-void DepthClip(FFX_MIN16_I2 iPxPos)
+void DepthClip(FfxInt32x2 iPxPos)
 {
-    FfxFloat32x2 fDepthUv = (FfxFloat32x2(iPxPos) + 0.5f) / RenderSize();
+    FfxFloat32x2 fDepthUv = (iPxPos + 0.5f) / RenderSize();
     FfxFloat32x2 fMotionVector = LoadDilatedMotionVector(iPxPos);
     FfxFloat32x2 fDilatedUv = fDepthUv + fMotionVector;
     FfxFloat32 fCurrentDepthViewSpace = abs(ConvertFromDeviceDepthToViewSpace(LoadDilatedDepth(iPxPos)));
@@ -91,3 +94,5 @@ void DepthClip(FFX_MIN16_I2 iPxPos)
 
     StoreDepthClip(iPxPos, fDepthClip);
 }
+
+#endif //!defined( FFX_FSR2_DEPTH_CLIPH )
\ No newline at end of file
diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip_pass.glsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip_pass.glsl
index 96218b8..7233ec6 100644
--- a/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip_pass.glsl
+++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip_pass.glsl
@@ -20,7 +20,7 @@
 // THE SOFTWARE.
 
 // FSR2 pass 3
-// SRV 7 : FSR2_ReconstructedPrevNearestDepth : r_ReconstructedPrevNearestDepth
+// SRV 7 : FSR2_ReconstructedPrevNearestDepth : r_reconstructed_previous_nearest_depth
 // SRV 8 : FSR2_DilatedVelocity : r_dilated_motion_vectors
 // SRV 9 : FSR2_DilatedDepth : r_dilatedDepth
 // UAV 12 : FSR2_DepthClip : rw_depth_clip
@@ -58,5 +58,5 @@
 FFX_FSR2_NUM_THREADS
 void main()
 {
-    DepthClip(FFX_MIN16_I2(gl_GlobalInvocationID.xy));
+    DepthClip(ivec2(gl_GlobalInvocationID.xy));
 }
diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip_pass.hlsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip_pass.hlsl
index d95d2a7..8433734 100644
--- a/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip_pass.hlsl
+++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_depth_clip_pass.hlsl
@@ -20,7 +20,7 @@
 // THE SOFTWARE.
 
 // FSR2 pass 3
-// SRV 7 : FSR2_ReconstructedPrevNearestDepth : r_ReconstructedPrevNearestDepth
+// SRV 7 : FSR2_ReconstructedPrevNearestDepth : r_reconstructed_previous_nearest_depth
 // SRV 8 : FSR2_DilatedVelocity : r_dilated_motion_vectors
 // SRV 9 : FSR2_DilatedDepth : r_dilatedDepth
 // UAV 12 : FSR2_DepthClip : rw_depth_clip
@@ -54,9 +54,9 @@ FFX_FSR2_PREFER_WAVE64
 FFX_FSR2_NUM_THREADS
 FFX_FSR2_EMBED_ROOTSIG_CONTENT
 void CS(
-    min16int2 iGroupId : SV_GroupID,
-    min16int2 iDispatchThreadId : SV_DispatchThreadID,
-    min16int2 iGroupThreadId : SV_GroupThreadID,
+    int2 iGroupId : SV_GroupID,
+    int2 iDispatchThreadId : SV_DispatchThreadID,
+    int2 iGroupThreadId : SV_GroupThreadID,
     int iGroupIndex : SV_GroupIndex)
 {
     DepthClip(iDispatchThreadId);
diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_lock.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_lock.h
index 24323df..b2266b7 100644
--- a/src/ffx-fsr2-api/shaders/ffx_fsr2_lock.h
+++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_lock.h
@@ -19,13 +19,16 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-FfxFloat32 GetLuma(FFX_MIN16_I2 pos)
+#ifndef FFX_FSR2_LOCK_H
+#define FFX_FSR2_LOCK_H
+
+FfxFloat32 GetLuma(FfxInt32x2 pos)
 {
     //add some bias to avoid locking dark areas
     return FfxFloat32(LoadPreparedInputColorLuma(pos));
 }
 
-FfxFloat32 ComputeThinFeatureConfidence(FFX_MIN16_I2 pos)
+FfxFloat32 ComputeThinFeatureConfidence(FfxInt32x2 pos)
 {
     const FfxInt32 RADIUS = 1;
@@ -59,7 +62,7 @@ FfxFloat32 ComputeThinFeatureConfidence(FFX_MIN16_I2 pos)
         for (FfxInt32 x = -RADIUS; x <= RADIUS; x++, idx++) {
             if (x == 0 && y == 0) continue;
 
-            FFX_MIN16_I2 samplePos = ClampLoad(pos, FFX_MIN16_I2(x, y), FFX_MIN16_I2(RenderSize()));
+            FfxInt32x2 samplePos = ClampLoad(pos, FfxInt32x2(x, y), FfxInt32x2(RenderSize()));
 
             FfxFloat32 sampleLuma = GetLuma(samplePos);
             FfxFloat32 difference = ffxMax(sampleLuma, fNucleus) / ffxMin(sampleLuma, fNucleus);
@@ -93,7 +96,7 @@ FfxFloat32 ComputeThinFeatureConfidence(FFX_MIN16_I2 pos)
 
 FFX_STATIC FfxBoolean s_bLockUpdated = FFX_FALSE;
 
-LOCK_STATUS_T ComputeLockStatus(FFX_MIN16_I2 iPxLrPos, LOCK_STATUS_T fLockStatus)
+FfxFloat32x3 ComputeLockStatus(FfxInt32x2 iPxLrPos, FfxFloat32x3 fLockStatus)
 {
     FfxFloat32 fConfidenceOfThinFeature = ComputeThinFeatureConfidence(iPxLrPos);
 
@@ -101,7 +104,7 @@ LOCK_STATUS_T ComputeLockStatus(FFX_MIN16_I2 iPxLrPos, LOCK_STATUS_T fLockStatus
     if (fConfidenceOfThinFeature > 0.0f) {
         //put to negative on new lock
-        fLockStatus[LOCK_LIFETIME_REMAINING] = (fLockStatus[LOCK_LIFETIME_REMAINING] == LOCK_STATUS_F1(0.0f)) ? LOCK_STATUS_F1(-LockInitialLifetime()) : LOCK_STATUS_F1(-(LockInitialLifetime() * 2));
+        fLockStatus[LOCK_LIFETIME_REMAINING] = (fLockStatus[LOCK_LIFETIME_REMAINING] == FfxFloat32(0.0f)) ? FfxFloat32(-LockInitialLifetime()) : FfxFloat32(-(LockInitialLifetime() * 2));
 
         s_bLockUpdated = FFX_TRUE;
     }
@@ -109,63 +112,15 @@ LOCK_STATUS_T ComputeLockStatus(FFX_MIN16_I2 iPxLrPos, LOCK_STATUS_T fLockStatus
     return fLockStatus;
 }
 
-void ComputeLock(FFX_MIN16_I2 iPxLrPos)
+void ComputeLock(FfxInt32x2 iPxLrPos)
 {
-    FfxFloat32x2 fSrcJitteredPos = FfxFloat32x2(iPxLrPos) + 0.5f - Jitter();
-    FfxFloat32x2 fLrPosInHr = (fSrcJitteredPos / RenderSize()) * DisplaySize();
-    FfxFloat32x2 fHrPos = floor(fLrPosInHr) + 0.5;
-    FFX_MIN16_I2 iPxHrPos = FFX_MIN16_I2(fHrPos);
+    FfxInt32x2 iPxHrPos = ComputeHrPosFromLrPos(iPxLrPos);
 
-    LOCK_STATUS_T fLockStatus = ComputeLockStatus(iPxLrPos, LoadLockStatus(iPxHrPos));
+    FfxFloat32x3 fLockStatus = ComputeLockStatus(iPxLrPos, LoadLockStatus(iPxHrPos));
 
     if ((s_bLockUpdated)) {
         StoreLockStatus(iPxHrPos, fLockStatus);
     }
 }
 
-FFX_GROUPSHARED FfxFloat32 gs_ReactiveMask[(8 + 4) * (8 + 4)];
-
-void StoreReactiveMaskToLDS(FfxUInt32x2 coord, FfxFloat32x2 value)
-{
-    FfxUInt32 baseIdx = coord.y * 12 + coord.x;
-    gs_ReactiveMask[baseIdx] = value.x;
-    gs_ReactiveMask[baseIdx + 1] = value.y;
-}
-
-FfxFloat32 LoadReactiveMaskFromLDS(FfxUInt32x2 coord)
-{
-    return gs_ReactiveMask[coord.y * 12 + coord.x];
-}
-
-void PreProcessReactiveMask(FFX_MIN16_I2 iPxLrPos, FfxUInt32x2 groupId, FfxUInt32x2 groupThreadId)
-{
-#if OPT_PRECOMPUTE_REACTIVE_MAX && !OPT_USE_EVAL_ACCUMULATION_REACTIVENESS
-
-    if (all(FFX_LESS_THAN(groupThreadId, FFX_BROADCAST_UINT32X2(6)))) {
-
-        FfxInt32x2 iPos = FfxInt32x2(groupId << 3) + FfxInt32x2(groupThreadId << 1) - 1;
-        FfxFloat32x4 fReactiveMask2x2 = GatherReactiveMask(iPos).wzxy;
-
-        StoreReactiveMaskToLDS(groupThreadId << 1, fReactiveMask2x2.xy);
-        StoreReactiveMaskToLDS((groupThreadId << 1) + FfxInt32x2(0, 1), fReactiveMask2x2.zw);
-    }
-
-    FFX_GROUP_MEMORY_BARRIER();
-
-    FfxFloat32 fReactiveMax = 0.0f;
-
-    for (FfxUInt32 row = 0; row < 4; row++) {
-        for (FfxUInt32 col = 0; col < 4; col++) {
-            const FfxUInt32x2 localOffset = groupThreadId + FfxUInt32x2(col, row);
-            const FfxBoolean bOutOfRenderBounds = any(FFX_GREATER_THAN_EQUAL((FfxInt32x2(groupId << 3) + FfxInt32x2(localOffset)), RenderSize()));
-            fReactiveMax = bOutOfRenderBounds ? fReactiveMax : ffxMax(fReactiveMax, LoadReactiveMaskFromLDS(localOffset));
-        }
-    }
-
-    // Threshold reactive value
-    fReactiveMax = fReactiveMax > 0.8f ? fReactiveMax : 0.0f;
-
-    StoreReactiveMax(iPxLrPos, FFX_MIN16_F(fReactiveMax));
-
-#endif //OPT_PRECOMPUTE_REACTIVE_MAX && !OPT_USE_EVAL_ACCUMULATION_REACTIVENESS
-}
+#endif // FFX_FSR2_LOCK_H
diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_lock_pass.glsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_lock_pass.glsl
index 1405f48..9c37774 100644
--- a/src/ffx-fsr2-api/shaders/ffx_fsr2_lock_pass.glsl
+++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_lock_pass.glsl
@@ -33,12 +33,10 @@
 #extension GL_GOOGLE_include_directive : require
 #extension GL_EXT_samplerless_texture_functions : require
 
-#define FSR2_BIND_SRV_REACTIVE_MASK 0
-#define FSR2_BIND_SRV_LOCK_STATUS 1
-#define FSR2_BIND_SRV_PREPARED_INPUT_COLOR 2
-#define FSR2_BIND_UAV_LOCK_STATUS 3
-#define FSR2_BIND_UAV_REACTIVE_MASK_MAX 4
-#define FSR2_BIND_CB_FSR2 5
+#define FSR2_BIND_SRV_LOCK_STATUS 0
+#define FSR2_BIND_SRV_PREPARED_INPUT_COLOR 1
+#define FSR2_BIND_UAV_LOCK_STATUS 2
+#define FSR2_BIND_CB_FSR2 3
 
 #include "ffx_fsr2_callbacks_glsl.h"
 #include "ffx_fsr2_common.h"
@@ -61,9 +59,7 @@
 FFX_FSR2_NUM_THREADS
 void main()
 {
-    uvec2 uDispatchThreadId = gl_WorkGroupID.xy * uvec2(FFX_FSR2_THREAD_GROUP_WIDTH, FFX_FSR2_THREAD_GROUP_HEIGHT) + gl_LocalInvocationID.xy;
+    uvec2 uDispatchThreadId = gl_WorkGroupID.xy * uvec2(FFX_FSR2_THREAD_GROUP_WIDTH, FFX_FSR2_THREAD_GROUP_HEIGHT) + gl_LocalInvocationID.xy;
 
-    ComputeLock(FFX_MIN16_I2(uDispatchThreadId));
-
-    PreProcessReactiveMask(FFX_MIN16_I2(uDispatchThreadId), gl_WorkGroupID.xy, gl_LocalInvocationID.xy);
+    ComputeLock(ivec2(uDispatchThreadId));
 }
diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_lock_pass.hlsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_lock_pass.hlsl
index 2f9c20e..492965c 100644
--- a/src/ffx-fsr2-api/shaders/ffx_fsr2_lock_pass.hlsl
+++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_lock_pass.hlsl
@@ -24,15 +24,11 @@
 // SRV 11 : FSR2_LockStatus2 : r_lock_status
 // SRV 13 : FSR2_PreparedInputColor : r_prepared_input_color
 // UAV 11 : FSR2_LockStatus1 : rw_lock_status
-// UAV 27 : FSR2_ReactiveMaskMax : rw_reactive_max
 // CB 0 : cbFSR2
-// CB 1 : FSR2DispatchOffsets
 
-#define FSR2_BIND_SRV_REACTIVE_MASK 0
 #define FSR2_BIND_SRV_LOCK_STATUS 1
 #define FSR2_BIND_SRV_PREPARED_INPUT_COLOR 2
 
 #define FSR2_BIND_UAV_LOCK_STATUS 0
-#define FSR2_BIND_UAV_REACTIVE_MASK_MAX 1
 
 #define FSR2_BIND_CB_FSR2 0
 
 #include "ffx_fsr2_callbacks_hlsl.h"
@@ -61,6 +57,4 @@ void CS(uint2 uGroupId : SV_GroupID, uint2 uGroupThreadId : SV_GroupThreadID)
     uint2 uDispatchThreadId = uGroupId * uint2(FFX_FSR2_THREAD_GROUP_WIDTH, FFX_FSR2_THREAD_GROUP_HEIGHT) + uGroupThreadId;
 
     ComputeLock(uDispatchThreadId);
-
-    PreProcessReactiveMask(uDispatchThreadId, uGroupId, uGroupThreadId);
 }
diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_postprocess_lock_status.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_postprocess_lock_status.h
index 06c5495..959031b 100644
--- a/src/ffx-fsr2-api/shaders/ffx_fsr2_postprocess_lock_status.h
+++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_postprocess_lock_status.h
@@ -22,57 +22,70 @@
 #ifndef FFX_FSR2_POSTPROCESS_LOCK_STATUS_H
 #define FFX_FSR2_POSTPROCESS_LOCK_STATUS_H
+FfxFloat32x4 WrapShadingChangeLuma(FfxInt32x2 iPxSample) +{ + return FfxFloat32x4(LoadMipLuma(iPxSample, LumaMipLevelToUse()), 0, 0, 0); +} + +#if FFX_HALF FFX_MIN16_F4 WrapShadingChangeLuma(FFX_MIN16_I2 iPxSample) { return FFX_MIN16_F4(LoadMipLuma(iPxSample, LumaMipLevelToUse()), 0, 0, 0); } +#endif +#if FFX_FSR2_OPTION_POSTPROCESSLOCKSTATUS_SAMPLERS_USE_DATA_HALF && FFX_HALF +DeclareCustomFetchBilinearSamplesMin16(FetchShadingChangeLumaSamples, WrapShadingChangeLuma) +#else DeclareCustomFetchBilinearSamples(FetchShadingChangeLumaSamples, WrapShadingChangeLuma) +#endif DeclareCustomTextureSample(ShadingChangeLumaSample, Bilinear, FetchShadingChangeLumaSamples) -FFX_MIN16_F GetShadingChangeLuma(FfxFloat32x2 fUvCoord) +FfxFloat32 GetShadingChangeLuma(FfxFloat32x2 fUvCoord) { // const FfxFloat32 fShadingChangeLuma = exp(ShadingChangeLumaSample(fUvCoord, LumaMipDimensions()) * LumaMipRcp()); - const FFX_MIN16_F fShadingChangeLuma = FFX_MIN16_F(exp(SampleMipLuma(fUvCoord, LumaMipLevelToUse()) * FFX_MIN16_F(LumaMipRcp()))); + const FfxFloat32 fShadingChangeLuma = FfxFloat32(exp(SampleMipLuma(fUvCoord, LumaMipLevelToUse()) * FfxFloat32(LumaMipRcp()))); return fShadingChangeLuma; } -LockState GetLockState(LOCK_STATUS_T fLockStatus) +LockState GetLockState(FfxFloat32x3 fLockStatus) { LockState state = { FFX_FALSE, FFX_FALSE }; //Check if this is a new or refreshed lock - state.NewLock = fLockStatus[LOCK_LIFETIME_REMAINING] < LOCK_STATUS_F1(0.0f); + state.NewLock = fLockStatus[LOCK_LIFETIME_REMAINING] < FfxFloat32(0.0f); //For a non-refreshed lock, the lifetime is set to LockInitialLifetime() - state.WasLockedPrevFrame = fLockStatus[LOCK_TRUST] != LOCK_STATUS_F1(0.0f); + state.WasLockedPrevFrame = fLockStatus[LOCK_TRUST] != FfxFloat32(0.0f); return state; } -LockState PostProcessLockStatus(FFX_MIN16_I2 iPxHrPos, FFX_PARAMETER_IN FfxFloat32x2 fLrUvJittered, FFX_PARAMETER_IN FFX_MIN16_F fDepthClipFactor, FFX_PARAMETER_IN FfxFloat32 fHrVelocity, - FFX_PARAMETER_INOUT FfxFloat32 fAccumulationTotalWeight, FFX_PARAMETER_INOUT LOCK_STATUS_T fLockStatus, FFX_PARAMETER_OUT FFX_MIN16_F fLuminanceDiff) { +LockState PostProcessLockStatus(FfxInt32x2 iPxHrPos, FFX_PARAMETER_IN FfxFloat32x2 fLrUvJittered, FFX_PARAMETER_IN FfxFloat32 fDepthClipFactor, const FfxFloat32 fAccumulationMask, FFX_PARAMETER_IN FfxFloat32 fHrVelocity, + FFX_PARAMETER_INOUT FfxFloat32 fAccumulationTotalWeight, FFX_PARAMETER_INOUT FfxFloat32x3 fLockStatus, FFX_PARAMETER_OUT FfxFloat32 fLuminanceDiff) { const LockState state = GetLockState(fLockStatus); fLockStatus[LOCK_LIFETIME_REMAINING] = abs(fLockStatus[LOCK_LIFETIME_REMAINING]); - FFX_MIN16_F fShadingChangeLuma = GetShadingChangeLuma(fLrUvJittered); + FfxFloat32 fShadingChangeLuma = GetShadingChangeLuma(fLrUvJittered); //init temporal shading change factor, init to -1 or so in reproject to know if "true new"? - fLockStatus[LOCK_TEMPORAL_LUMA] = (fLockStatus[LOCK_TEMPORAL_LUMA] == LOCK_STATUS_F1(0.0f)) ? fShadingChangeLuma : fLockStatus[LOCK_TEMPORAL_LUMA]; + fLockStatus[LOCK_TEMPORAL_LUMA] = (fLockStatus[LOCK_TEMPORAL_LUMA] == FfxFloat32(0.0f)) ? 
fShadingChangeLuma : fLockStatus[LOCK_TEMPORAL_LUMA]; - FFX_MIN16_F fPreviousShadingChangeLuma = fLockStatus[LOCK_TEMPORAL_LUMA]; - fLockStatus[LOCK_TEMPORAL_LUMA] = ffxLerp(fLockStatus[LOCK_TEMPORAL_LUMA], LOCK_STATUS_F1(fShadingChangeLuma), LOCK_STATUS_F1(0.5f)); - fLuminanceDiff = FFX_MIN16_F(1) - MinDividedByMax(fPreviousShadingChangeLuma, fShadingChangeLuma); + FfxFloat32 fPreviousShadingChangeLuma = fLockStatus[LOCK_TEMPORAL_LUMA]; + fLockStatus[LOCK_TEMPORAL_LUMA] = ffxLerp(fLockStatus[LOCK_TEMPORAL_LUMA], FfxFloat32(fShadingChangeLuma), FfxFloat32(0.5f)); + fLuminanceDiff = FfxFloat32(1) - MinDividedByMax(fPreviousShadingChangeLuma, fShadingChangeLuma); - if (fLuminanceDiff > FFX_MIN16_F(0.2f)) { + if (fLuminanceDiff > FfxFloat32(0.2f)) { KillLock(fLockStatus); } - if (!state.NewLock && fLockStatus[LOCK_LIFETIME_REMAINING] >= LOCK_STATUS_F1(0)) + if (!state.NewLock && fLockStatus[LOCK_LIFETIME_REMAINING] >= FfxFloat32(0)) { - const FFX_MIN16_F depthClipThreshold = FFX_MIN16_F(0.99f); + fLockStatus[LOCK_LIFETIME_REMAINING] *= (1.0f - fAccumulationMask); + + const FfxFloat32 depthClipThreshold = FfxFloat32(0.99f); if (fDepthClipFactor < depthClipThreshold) { KillLock(fLockStatus); diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color.h index b9772b6..a773cda 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color.h +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color.h @@ -19,10 +19,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. +#ifndef FFX_FSR2_PREPARE_INPUT_COLOR_H +#define FFX_FSR2_PREPARE_INPUT_COLOR_H + //TODO: Move to common location & share with Accumulate -void ClearResourcesForNextFrame(in FFX_MIN16_I2 iPxHrPos) +void ClearResourcesForNextFrame(in FfxInt32x2 iPxHrPos) { - if (all(FFX_LESS_THAN(iPxHrPos, FFX_MIN16_I2(RenderSize())))) + if (all(FFX_LESS_THAN(iPxHrPos, FfxInt32x2(RenderSize())))) { #if FFX_FSR2_OPTION_INVERTED_DEPTH const FfxUInt32 farZ = 0x0; @@ -33,7 +36,7 @@ void ClearResourcesForNextFrame(in FFX_MIN16_I2 iPxHrPos) } } -void ComputeLumaStabilityFactor(FFX_MIN16_I2 iPxLrPos, FfxFloat32 fCurrentFrameLuma) +void ComputeLumaStabilityFactor(FfxInt32x2 iPxLrPos, FfxFloat32 fCurrentFrameLuma) { FfxFloat32x4 fCurrentFrameLumaHistory = LoadRwLumaHistory(iPxLrPos); @@ -54,12 +57,12 @@ void ComputeLumaStabilityFactor(FFX_MIN16_I2 iPxLrPos, FfxFloat32 fCurrentFrameL StoreLumaHistory(iPxLrPos, fCurrentFrameLumaHistory); } -void PrepareInputColor(FFX_MIN16_I2 iPxLrPos) +void PrepareInputColor(FfxInt32x2 iPxLrPos) { //We assume linear data. if non-linear input (sRGB, ...), //then we should convert to linear first and back to sRGB on output. - FfxFloat32x3 fRgb = ffxMax(FFX_BROADCAST_FLOAT32X3(0), LoadInputColor(iPxLrPos)); + FfxFloat32x3 fRgb = ffxMax(FfxFloat32x3(0, 0, 0), LoadInputColor(iPxLrPos)); fRgb *= Exposure(); @@ -68,16 +71,18 @@ void PrepareInputColor(FFX_MIN16_I2 iPxLrPos) fRgb = Tonemap(fRgb); #endif - PREPARED_INPUT_COLOR_T fYCoCg; + FfxFloat32x4 fYCoCg; - fYCoCg.xyz = PREPARED_INPUT_COLOR_F3(RGBToYCoCg(fRgb)); + fYCoCg.xyz = RGBToYCoCg(fRgb); const FfxFloat32 fPerceivedLuma = RGBToPerceivedLuma(fRgb); ComputeLumaStabilityFactor(iPxLrPos, fPerceivedLuma); //compute luma used to lock pixels, if used elsewhere the ffxPow must be moved! 
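As context for the prepare-input-color changes in this hunk: the prepared colour target now stores plain FfxFloat32x4 data, with chroma-decorrelated colour in .xyz and a perceptually compressed lock luma in .w (the pow(fPerceivedLuma, 1/6) term stored on the next line). A small self-contained C++ sketch of that packing follows; neither RGBToYCoCg nor RGBToPerceivedLuma is defined in this hunk, so the conventional YCoCg transform and Rec.709 luma weights are assumed here, and the optional tonemap step is omitted.

#include <cmath>
#include <algorithm>

struct Float3 { float x, y, z; };
struct Float4 { float x, y, z, w; };

// Assumed YCoCg transform (the FSR2 helper RGBToYCoCg is not shown in this diff).
static Float3 RgbToYCoCg(Float3 c) {
    return {  0.25f * c.x + 0.5f * c.y + 0.25f * c.z,   // Y
              0.5f  * c.x              - 0.5f  * c.z,   // Co
             -0.25f * c.x + 0.5f * c.y - 0.25f * c.z }; // Cg
}

// Stand-in for RGBToPerceivedLuma; Rec.709 luma weights assumed.
static float PerceivedLuma(Float3 c) {
    return 0.2126f * c.x + 0.7152f * c.y + 0.0722f * c.z;
}

// Mirrors PrepareInputColor: clamp negatives, apply exposure, pack YCoCg plus luma^(1/6).
static Float4 PackPreparedColor(Float3 rgb, float exposure) {
    Float3 c { std::max(rgb.x, 0.0f) * exposure,
               std::max(rgb.y, 0.0f) * exposure,
               std::max(rgb.z, 0.0f) * exposure };
    const Float3 ycocg    = RgbToYCoCg(c);
    const float  lockLuma = std::pow(PerceivedLuma(c), 1.0f / 6.0f);
    return { ycocg.x, ycocg.y, ycocg.z, lockLuma };
}

The sixth-root keeps the stored lock luma perceptually flatter, so small changes in dark regions are not swamped by bright pixels when the later passes compare it.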
- fYCoCg.w = PREPARED_INPUT_COLOR_F1(ffxPow(fPerceivedLuma, 1.0f / 6.0f)); + fYCoCg.w = ffxPow(fPerceivedLuma, FfxFloat32(1.0 / 6.0)); StorePreparedInputColor(iPxLrPos, fYCoCg); ClearResourcesForNextFrame(iPxLrPos); } + +#endif // FFX_FSR2_PREPARE_INPUT_COLOR_H diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color_pass.glsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color_pass.glsl index ed10dc1..d37e0af 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color_pass.glsl +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color_pass.glsl @@ -18,11 +18,10 @@ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. - // FSR2 pass 1 // SRV 1 : m_HDR : r_input_color_jittered // SRV 4 : FSR2_Exposure : r_exposure -// UAV 7 : FSR2_ReconstructedPrevNearestDepth : rw_ReconstructedPrevNearestDepth +// UAV 7 : FSR2_ReconstructedPrevNearestDepth : rw_reconstructed_previous_nearest_depth // UAV 13 : FSR2_PreparedInputColor : rw_prepared_input_color // UAV 14 : FSR2_LumaHistory : rw_luma_history // CB 0 : cbFSR2 @@ -59,5 +58,5 @@ FFX_FSR2_NUM_THREADS void main() { - PrepareInputColor(FFX_MIN16_I2(gl_GlobalInvocationID.xy)); + PrepareInputColor(ivec2(gl_GlobalInvocationID.xy)); } diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color_pass.hlsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color_pass.hlsl index b8d258a..bed086f 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color_pass.hlsl +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_prepare_input_color_pass.hlsl @@ -22,7 +22,7 @@ // FSR2 pass 1 // SRV 1 : m_HDR : r_input_color_jittered // SRV 4 : FSR2_Exposure : r_exposure -// UAV 7 : FSR2_ReconstructedPrevNearestDepth : rw_ReconstructedPrevNearestDepth +// UAV 7 : FSR2_ReconstructedPrevNearestDepth : rw_reconstructed_previous_nearest_depth // UAV 13 : FSR2_PreparedInputColor : rw_prepared_input_color // UAV 14 : FSR2_LumaHistory : rw_luma_history // CB 0 : cbFSR2 diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_rcas_pass.glsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_rcas_pass.glsl index d437bb9..1097faf 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_rcas_pass.glsl +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_rcas_pass.glsl @@ -89,4 +89,4 @@ FFX_FSR2_NUM_THREADS void main() { RCAS(gl_LocalInvocationID.xyz, gl_WorkGroupID.xyz, gl_GlobalInvocationID.xyz); -} +} \ No newline at end of file diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_dilated_velocity_and_previous_depth.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_dilated_velocity_and_previous_depth.h index 655fa13..aad1992 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_dilated_velocity_and_previous_depth.h +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_dilated_velocity_and_previous_depth.h @@ -22,11 +22,11 @@ #ifndef FFX_FSR2_RECONSTRUCT_DILATED_VELOCITY_AND_PREVIOUS_DEPTH_H #define FFX_FSR2_RECONSTRUCT_DILATED_VELOCITY_AND_PREVIOUS_DEPTH_H -void ReconstructPrevDepth(FFX_MIN16_I2 iPxPos, FfxFloat32 fDepth, FfxFloat32x2 fMotionVector, FFX_MIN16_I2 iPxDepthSize) +void ReconstructPrevDepth(FfxInt32x2 iPxPos, FfxFloat32 fDepth, FfxFloat32x2 fMotionVector, FfxInt32x2 iPxDepthSize) { - FfxFloat32x2 fDepthUv = (FfxFloat32x2(iPxPos) + 0.5f) / iPxDepthSize; - FfxFloat32x2 fPxPrevPos = (fDepthUv + fMotionVector) * FfxFloat32x2(iPxDepthSize)-0.5f; - FFX_MIN16_I2 iPxPrevPos = FFX_MIN16_I2(floor(fPxPrevPos)); + FfxFloat32x2 fDepthUv = (iPxPos + FfxFloat32(0.5)) / 
iPxDepthSize; + FfxFloat32x2 fPxPrevPos = (fDepthUv + fMotionVector) * iPxDepthSize - FfxFloat32x2(0.5, 0.5); + FfxInt32x2 iPxPrevPos = FfxInt32x2(floor(fPxPrevPos)); FfxFloat32x2 fPxFrac = ffxFract(fPxPrevPos); const FfxFloat32 bilinearWeights[2][2] = { @@ -45,12 +45,12 @@ void ReconstructPrevDepth(FFX_MIN16_I2 iPxPos, FfxFloat32 fDepth, FfxFloat32x2 f for (FfxInt32 y = 0; y <= 1; ++y) { for (FfxInt32 x = 0; x <= 1; ++x) { - FFX_MIN16_I2 offset = FFX_MIN16_I2(x, y); + FfxInt32x2 offset = FfxInt32x2(x, y); FfxFloat32 w = bilinearWeights[y][x]; if (w > reconstructedDepthBilinearWeightThreshold) { - FFX_MIN16_I2 storePos = iPxPrevPos + offset; + FfxInt32x2 storePos = iPxPrevPos + offset; if (IsOnScreen(storePos, iPxDepthSize)) { StoreReconstructedDepth(storePos, fDepth); } @@ -59,19 +59,19 @@ void ReconstructPrevDepth(FFX_MIN16_I2 iPxPos, FfxFloat32 fDepth, FfxFloat32x2 f } } -void FindNearestDepth(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX_MIN16_I2 iPxSize, FFX_PARAMETER_OUT FfxFloat32 fNearestDepth, FFX_PARAMETER_OUT FFX_MIN16_I2 fNearestDepthCoord) +void FindNearestDepth(FFX_PARAMETER_IN FfxInt32x2 iPxPos, FFX_PARAMETER_IN FfxInt32x2 iPxSize, FFX_PARAMETER_OUT FfxFloat32 fNearestDepth, FFX_PARAMETER_OUT FfxInt32x2 fNearestDepthCoord) { const FfxInt32 iSampleCount = 9; - const FFX_MIN16_I2 iSampleOffsets[iSampleCount] = { - FFX_MIN16_I2(+0, +0), - FFX_MIN16_I2(+1, +0), - FFX_MIN16_I2(+0, +1), - FFX_MIN16_I2(+0, -1), - FFX_MIN16_I2(-1, +0), - FFX_MIN16_I2(-1, +1), - FFX_MIN16_I2(+1, +1), - FFX_MIN16_I2(-1, -1), - FFX_MIN16_I2(+1, -1), + const FfxInt32x2 iSampleOffsets[iSampleCount] = { + FfxInt32x2(+0, +0), + FfxInt32x2(+1, +0), + FfxInt32x2(+0, +1), + FfxInt32x2(+0, -1), + FfxInt32x2(-1, +0), + FfxInt32x2(-1, +1), + FfxInt32x2(+1, +1), + FfxInt32x2(-1, -1), + FfxInt32x2(+1, -1), }; // pull out the depth loads to allow SC to batch them @@ -80,7 +80,7 @@ void FindNearestDepth(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX FFX_UNROLL for (iSampleIndex = 0; iSampleIndex < iSampleCount; ++iSampleIndex) { - FFX_MIN16_I2 iPos = iPxPos + iSampleOffsets[iSampleIndex]; + FfxInt32x2 iPos = iPxPos + iSampleOffsets[iSampleIndex]; depth[iSampleIndex] = LoadInputDepth(iPos); } @@ -90,7 +90,7 @@ void FindNearestDepth(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX FFX_UNROLL for (iSampleIndex = 1; iSampleIndex < iSampleCount; ++iSampleIndex) { - FFX_MIN16_I2 iPos = iPxPos + iSampleOffsets[iSampleIndex]; + FfxInt32x2 iPos = iPxPos + iSampleOffsets[iSampleIndex]; if (IsOnScreen(iPos, iPxSize)) { FfxFloat32 fNdDepth = depth[iSampleIndex]; @@ -106,30 +106,97 @@ void FindNearestDepth(FFX_PARAMETER_IN FFX_MIN16_I2 iPxPos, FFX_PARAMETER_IN FFX } } -void ReconstructPrevDepthAndDilateMotionVectors(FFX_MIN16_I2 iPxLrPos) +FfxFloat32 ComputeMotionDivergence(FfxInt32x2 iPxPos, FfxInt32x2 iPxInputMotionVectorSize) { - FFX_MIN16_I2 iPxLrSize = FFX_MIN16_I2(RenderSize()); - FFX_MIN16_I2 iPxHrSize = FFX_MIN16_I2(DisplaySize()); + FfxFloat32 minconvergence = 1.0f; + FfxFloat32x2 fMotionVectorNucleus = LoadInputMotionVector(iPxPos) * RenderSize(); + FfxFloat32 fNucleusVelocity = length(fMotionVectorNucleus); + + const FfxFloat32 MotionVectorVelocityEpsilon = 1e-02f; + + if (fNucleusVelocity > MotionVectorVelocityEpsilon) { + for (FfxInt32 y = -1; y <= 1; ++y) { + for (FfxInt32 x = -1; x <= 1; ++x) { + + FfxInt32x2 sp = ClampLoad(iPxPos, FfxInt32x2(x, y), iPxInputMotionVectorSize); + + FfxFloat32x2 fMotionVector = LoadInputMotionVector(sp) * RenderSize(); + FfxFloat32 fVelocity = 
length(fMotionVector); + + fVelocity = ffxMax(fVelocity, fNucleusVelocity); + minconvergence = ffxMin(minconvergence, dot(fMotionVector / fVelocity, fMotionVectorNucleus / fVelocity)); + } + } + } + + return ffxSaturate(1.0f - minconvergence); +} + + +void PreProcessReactiveMasks(FfxInt32x2 iPxLrPos, FfxFloat32 fMotionDivergence) +{ + // Compensate for bilinear sampling in accumulation pass + + FfxFloat32x3 fReferenceColor = LoadPreparedInputColor(iPxLrPos); + FfxFloat32x2 fReactiveFactor = FfxFloat32x2(0.0f, fMotionDivergence); + + for (int y = -1; y < 2; y++) { + for (int x = -1; x < 2; x++) { + + const FfxInt32x2 sampleCoord = ClampLoad(iPxLrPos, FfxInt32x2(x, y), FfxInt32x2(RenderSize())); + + FfxFloat32x3 fColorSample = LoadPreparedInputColor(sampleCoord); + FfxFloat32 fReactiveSample = LoadReactiveMask(sampleCoord); + FfxFloat32 fTransparencyAndCompositionSample = LoadTransparencyAndCompositionMask(sampleCoord); + + const FfxFloat32 fColorSimilarity = dot(normalize(fReferenceColor), normalize(fColorSample)); + const FfxFloat32 fVelocitySimilarity = 1.0f - abs(length(fReferenceColor) - length(fColorSample)); + const FfxFloat32 fSimilarity = fColorSimilarity * fVelocitySimilarity; + + // Increase power for non-similar samples + const FfxFloat32 fPowerBiasMax = 6.0f; + const FfxFloat32 fSimilarityPower = 1.0f + (fPowerBiasMax - fSimilarity * fPowerBiasMax); + const FfxFloat32 fWeightedReactiveSample = ffxPow(fReactiveSample, fSimilarityPower); + const FfxFloat32 fWeightedTransparencyAndCompositionSample = ffxPow(fTransparencyAndCompositionSample, fSimilarityPower); + + fReactiveFactor = ffxMax(fReactiveFactor, FfxFloat32x2(fWeightedReactiveSample, fWeightedTransparencyAndCompositionSample)); + } + } + + StoreDilatedReactiveMasks(iPxLrPos, fReactiveFactor); +} + +void ReconstructAndDilate(FfxInt32x2 iPxLrPos) +{ FfxFloat32 fDilatedDepth; - FFX_MIN16_I2 iNearestDepthCoord; + FfxInt32x2 iNearestDepthCoord; - FindNearestDepth(iPxLrPos, iPxLrSize, fDilatedDepth, iNearestDepthCoord); + FindNearestDepth(iPxLrPos, RenderSize(), fDilatedDepth, iNearestDepthCoord); #if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS - FfxFloat32x2 fDilatedMotionVector = LoadInputMotionVector(iNearestDepthCoord); + FfxInt32x2 iSamplePos = iPxLrPos; + FfxInt32x2 iMotionVectorPos = iNearestDepthCoord; #else - FfxFloat32x2 fSrcJitteredPos = FfxFloat32x2(iNearestDepthCoord) + 0.5f - Jitter(); - FfxFloat32x2 fLrPosInHr = (fSrcJitteredPos / iPxLrSize) * iPxHrSize; - FfxFloat32x2 fHrPos = floor(fLrPosInHr) + 0.5; - - FfxFloat32x2 fDilatedMotionVector = LoadInputMotionVector(FFX_MIN16_I2(fHrPos)); + FfxInt32x2 iSamplePos = ComputeHrPosFromLrPos(iPxLrPos); + FfxInt32x2 iMotionVectorPos = ComputeHrPosFromLrPos(iNearestDepthCoord); #endif + FfxFloat32x2 fDilatedMotionVector = LoadInputMotionVector(iMotionVectorPos); + StoreDilatedDepth(iPxLrPos, fDilatedDepth); StoreDilatedMotionVector(iPxLrPos, fDilatedMotionVector); - ReconstructPrevDepth(iPxLrPos, fDilatedDepth, fDilatedMotionVector, iPxLrSize); + ReconstructPrevDepth(iPxLrPos, fDilatedDepth, fDilatedMotionVector, RenderSize()); + +#if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS + FfxFloat32 fMotionDivergence = ComputeMotionDivergence(iSamplePos, RenderSize()); +#else + FfxFloat32 fMotionDivergence = ComputeMotionDivergence(iSamplePos, DisplaySize()); +#endif + + PreProcessReactiveMasks(iPxLrPos, fMotionDivergence); } + #endif //!defined( FFX_FSR2_RECONSTRUCT_DILATED_VELOCITY_AND_PREVIOUS_DEPTH_H ) diff --git 
a/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl index 7579c49..96d1383 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_previous_depth_pass.glsl @@ -22,7 +22,7 @@ // FSR2 pass 2 // SRV 2 : m_MotionVector : r_motion_vectors // SRV 3 : m_depthbuffer : r_depth -// UAV 7 : FSR2_ReconstructedPrevNearestDepth : rw_ReconstructedPrevNearestDepth +// UAV 7 : FSR2_ReconstructedPrevNearestDepth : rw_reconstructed_previous_nearest_depth // UAV 8 : FSR2_DilatedVelocity : rw_dilated_motion_vectors // UAV 9 : FSR2_DilatedDepth : rw_dilatedDepth // CB 0 : cbFSR2 @@ -34,10 +34,14 @@ #define FSR2_BIND_SRV_MOTION_VECTORS 0 #define FSR2_BIND_SRV_DEPTH 1 -#define FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH 2 -#define FSR2_BIND_UAV_DILATED_MOTION_VECTORS 3 -#define FSR2_BIND_UAV_DILATED_DEPTH 4 -#define FSR2_BIND_CB_FSR2 5 +#define FSR2_BIND_SRV_REACTIVE_MASK 2 +#define FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK 3 +#define FSR2_BIND_SRV_PREPARED_INPUT_COLOR 4 +#define FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH 5 +#define FSR2_BIND_UAV_DILATED_MOTION_VECTORS 6 +#define FSR2_BIND_UAV_DILATED_DEPTH 7 +#define FSR2_BIND_UAV_DILATED_REACTIVE_MASKS 8 +#define FSR2_BIND_CB_FSR2 9 #include "ffx_fsr2_callbacks_glsl.h" #include "ffx_fsr2_common.h" @@ -60,5 +64,5 @@ FFX_FSR2_NUM_THREADS void main() { - ReconstructPrevDepthAndDilateMotionVectors(FFX_MIN16_I2(gl_GlobalInvocationID.xy)); + ReconstructAndDilate(FFX_MIN16_I2(gl_GlobalInvocationID.xy)); } diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_previous_depth_pass.hlsl b/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_previous_depth_pass.hlsl index 21825cb..57f3f49 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_previous_depth_pass.hlsl +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_reconstruct_previous_depth_pass.hlsl @@ -22,16 +22,20 @@ // FSR2 pass 2 // SRV 2 : m_MotionVector : r_motion_vectors // SRV 3 : m_depthbuffer : r_depth -// UAV 7 : FSR2_ReconstructedPrevNearestDepth : rw_ReconstructedPrevNearestDepth +// UAV 7 : FSR2_ReconstructedPrevNearestDepth : rw_reconstructed_previous_nearest_depth // UAV 8 : FSR2_DilatedVelocity : rw_dilated_motion_vectors // UAV 9 : FSR2_DilatedDepth : rw_dilatedDepth // CB 0 : cbFSR2 #define FSR2_BIND_SRV_MOTION_VECTORS 0 #define FSR2_BIND_SRV_DEPTH 1 +#define FSR2_BIND_SRV_REACTIVE_MASK 2 +#define FSR2_BIND_SRV_TRANSPARENCY_AND_COMPOSITION_MASK 3 +#define FSR2_BIND_SRV_PREPARED_INPUT_COLOR 4 #define FSR2_BIND_UAV_RECONSTRUCTED_PREV_NEAREST_DEPTH 0 #define FSR2_BIND_UAV_DILATED_MOTION_VECTORS 1 #define FSR2_BIND_UAV_DILATED_DEPTH 2 +#define FSR2_BIND_UAV_DILATED_REACTIVE_MASKS 3 #define FSR2_BIND_CB_FSR2 0 #include "ffx_fsr2_callbacks_hlsl.h" @@ -56,11 +60,11 @@ FFX_FSR2_PREFER_WAVE64 FFX_FSR2_NUM_THREADS FFX_FSR2_EMBED_ROOTSIG_CONTENT void CS( - min16int2 iGroupId : SV_GroupID, - min16int2 iDispatchThreadId : SV_DispatchThreadID, - min16int2 iGroupThreadId : SV_GroupThreadID, + int2 iGroupId : SV_GroupID, + int2 iDispatchThreadId : SV_DispatchThreadID, + int2 iGroupThreadId : SV_GroupThreadID, int iGroupIndex : SV_GroupIndex ) { - ReconstructPrevDepthAndDilateMotionVectors(iDispatchThreadId); + ReconstructAndDilate(iDispatchThreadId); } diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_reproject.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_reproject.h index 3fceafd..5ae962d 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_reproject.h +++ 
b/src/ffx-fsr2-api/shaders/ffx_fsr2_reproject.h @@ -22,31 +22,62 @@ #ifndef FFX_FSR2_REPROJECT_H #define FFX_FSR2_REPROJECT_H -FFX_MIN16_F4 WrapHistory(FfxInt32x2 iPxSample) +#ifndef FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE +#define FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE 1 // Approximate +#endif + +FfxFloat32x4 WrapHistory(FfxInt32x2 iPxSample) { return LoadHistory(iPxSample); } -DeclareCustomFetchBicubicSamplesMin16(FetchHistorySamples, WrapHistory) -DeclareCustomTextureSample(HistorySample, Lanczos2, FetchHistorySamples) - - -FFX_MIN16_F4 WrapLockStatus(FfxInt32x2 iPxSample) +#if FFX_HALF +FFX_MIN16_F4 WrapHistory(FFX_MIN16_I2 iPxSample) { - return FFX_MIN16_F4(LoadLockStatus(FFX_MIN16_I2(iPxSample)), 0); + return FFX_MIN16_F4(LoadHistory(iPxSample)); } - -#if 1 -DeclareCustomFetchBilinearSamples(FetchLockStatusSamples, WrapLockStatus) -DeclareCustomTextureSample(LockStatusSample, Bilinear, FetchLockStatusSamples) -#else -DeclareCustomFetchBicubicSamplesMin16(FetchLockStatusSamples, WrapLockStatus) -DeclareCustomTextureSample(LockStatusSample, Lanczos2, FetchLockStatusSamples) #endif +#if FFX_FSR2_OPTION_REPROJECT_SAMPLERS_USE_DATA_HALF && FFX_HALF +DeclareCustomFetchBicubicSamplesMin16(FetchHistorySamples, WrapHistory) +DeclareCustomTextureSampleMin16(HistorySample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchHistorySamples) +#else +DeclareCustomFetchBicubicSamples(FetchHistorySamples, WrapHistory) +DeclareCustomTextureSample(HistorySample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchHistorySamples) +#endif -FfxFloat32x2 GetMotionVector(FFX_MIN16_I2 iPxHrPos, FfxFloat32x2 fHrUv) +FfxFloat32x4 WrapLockStatus(FfxInt32x2 iPxSample) +{ + return FfxFloat32x4(LoadLockStatus(iPxSample), 0.0f); +} + +#if FFX_HALF +FFX_MIN16_F4 WrapLockStatus(FFX_MIN16_I2 iPxSample) +{ + return FFX_MIN16_F4(LoadLockStatus(iPxSample), 0.0f); +} +#endif + +#if 1 +#if FFX_FSR2_OPTION_REPROJECT_SAMPLERS_USE_DATA_HALF && FFX_HALF +DeclareCustomFetchBilinearSamplesMin16(FetchLockStatusSamples, WrapLockStatus) +DeclareCustomTextureSampleMin16(LockStatusSample, Bilinear, FetchLockStatusSamples) +#else +DeclareCustomFetchBilinearSamples(FetchLockStatusSamples, WrapLockStatus) +DeclareCustomTextureSample(LockStatusSample, Bilinear, FetchLockStatusSamples) +#endif +#else +#if FFX_FSR2_OPTION_REPROJECT_SAMPLERS_USE_DATA_HALF && FFX_HALF +DeclareCustomFetchBicubicSamplesMin16(FetchLockStatusSamples, WrapLockStatus) +DeclareCustomTextureSampleMin16(LockStatusSample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchLockStatusSamples) +#else +DeclareCustomFetchBicubicSamples(FetchLockStatusSamples, WrapLockStatus) +DeclareCustomTextureSample(LockStatusSample, FFX_FSR2_GET_LANCZOS_SAMPLER1D(FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE), FetchLockStatusSamples) +#endif +#endif + +FfxFloat32x2 GetMotionVector(FfxInt32x2 iPxHrPos, FfxFloat32x2 fHrUv) { #if FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS FfxFloat32x2 fDilatedMotionVector = LoadDilatedMotionVector(FFX_MIN16_I2(fHrUv * RenderSize())); @@ -78,10 +109,10 @@ void ReprojectHistoryColor(FfxInt32x2 iPxHrPos, FfxFloat32x2 fReprojectedHrUv, F fHistoryColorAndWeight.rgb = RGBToYCoCg(fHistoryColorAndWeight.rgb); } -void ReprojectHistoryLockStatus(FfxInt32x2 iPxHrPos, FfxFloat32x2 fReprojectedHrUv, FFX_PARAMETER_OUT LOCK_STATUS_T fReprojectedLockStatus) +void ReprojectHistoryLockStatus(FfxInt32x2 iPxHrPos, FfxFloat32x2 fReprojectedHrUv, FFX_PARAMETER_OUT FfxFloat32x3 
fReprojectedLockStatus) { // If function is called from Accumulate pass, we need to treat locks differently - LOCK_STATUS_F1 fInPlaceLockLifetime = LoadRwLockStatus(iPxHrPos)[LOCK_LIFETIME_REMAINING]; + FfxFloat32 fInPlaceLockLifetime = LoadRwLockStatus(iPxHrPos)[LOCK_LIFETIME_REMAINING]; fReprojectedLockStatus = SampleLockStatus(fReprojectedHrUv); @@ -90,4 +121,5 @@ void ReprojectHistoryLockStatus(FfxInt32x2 iPxHrPos, FfxFloat32x2 fReprojectedHr fReprojectedLockStatus[LOCK_LIFETIME_REMAINING] = fInPlaceLockLifetime; } } + #endif //!defined( FFX_FSR2_REPROJECT_H ) diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_resources.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_resources.h index 2309351..89734f6 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_resources.h +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_resources.h @@ -50,7 +50,7 @@ #define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_REACTIVITY 24 #define FFX_FSR2_RESOURCE_IDENTIFIER_INTERNAL_DEFAULT_TRANSPARENCY_AND_COMPOSITION 25 #define FFX_FSR2_RESOURCE_IDENTITIER_UPSAMPLE_MAXIMUM_BIAS_LUT 26 -#define FFX_FSR2_RESOURCE_IDENTIFIER_REACTIVE_MAX 27 +#define FFX_FSR2_RESOURCE_IDENTIFIER_DILATED_REACTIVE_MASKS 27 #define FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE 28 // same as FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_0 #define FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_0 28 #define FFX_FSR2_RESOURCE_IDENTIFIER_AUTO_EXPOSURE_MIPMAP_1 29 @@ -74,7 +74,6 @@ #define FFX_FSR2_RESOURCE_IDENTIFIER_COUNT 43 - #define FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_FSR2 0 #define FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_SPD 1 #define FFX_FSR2_CONSTANTBUFFER_IDENTIFIER_RCAS 2 diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_sample.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_sample.h index 95fa51d..f697d70 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_sample.h +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_sample.h @@ -60,30 +60,40 @@ struct FetchedBicubicSamples { }; #if FFX_HALF +struct FetchedBilinearSamplesMin16 { + + FFX_MIN16_F4 fColor00; + FFX_MIN16_F4 fColor10; + + FFX_MIN16_F4 fColor01; + FFX_MIN16_F4 fColor11; +}; + struct FetchedBicubicSamplesMin16 { - FfxFloat16x4 fColor00; - FfxFloat16x4 fColor10; - FfxFloat16x4 fColor20; - FfxFloat16x4 fColor30; + FFX_MIN16_F4 fColor00; + FFX_MIN16_F4 fColor10; + FFX_MIN16_F4 fColor20; + FFX_MIN16_F4 fColor30; - FfxFloat16x4 fColor01; - FfxFloat16x4 fColor11; - FfxFloat16x4 fColor21; - FfxFloat16x4 fColor31; + FFX_MIN16_F4 fColor01; + FFX_MIN16_F4 fColor11; + FFX_MIN16_F4 fColor21; + FFX_MIN16_F4 fColor31; - FfxFloat16x4 fColor02; - FfxFloat16x4 fColor12; - FfxFloat16x4 fColor22; - FfxFloat16x4 fColor32; + FFX_MIN16_F4 fColor02; + FFX_MIN16_F4 fColor12; + FFX_MIN16_F4 fColor22; + FFX_MIN16_F4 fColor32; - FfxFloat16x4 fColor03; - FfxFloat16x4 fColor13; - FfxFloat16x4 fColor23; - FfxFloat16x4 fColor33; + FFX_MIN16_F4 fColor03; + FFX_MIN16_F4 fColor13; + FFX_MIN16_F4 fColor23; + FFX_MIN16_F4 fColor33; }; #else //FFX_HALF #define FetchedBicubicSamplesMin16 FetchedBicubicSamples +#define FetchedBilinearSamplesMin16 FetchedBilinearSamples #endif //FFX_HALF FfxFloat32x4 Linear(FfxFloat32x4 A, FfxFloat32x4 B, FfxFloat32 t) @@ -99,39 +109,44 @@ FfxFloat32x4 Bilinear(FetchedBilinearSamples BilinearSamples, FfxFloat32x2 fPxFr return fColorXY; } -// SEE: ../Interpolation/CatmullRom.ipynb, t=0 -> B, t=1 -> C -FfxFloat32x4 CubicCatmullRom(FfxFloat32x4 A, FfxFloat32x4 B, FfxFloat32x4 C, FfxFloat32x4 D, FfxFloat32 t) +#if FFX_HALF +FFX_MIN16_F4 Linear(FFX_MIN16_F4 A, FFX_MIN16_F4 B, FFX_MIN16_F t) { - FfxFloat32 t2 = t * t; - FfxFloat32 t3 = t * t * 
t; - FfxFloat32x4 a = -A / 2.f + (3.f * B) / 2.f - (3.f * C) / 2.f + D / 2.f; - FfxFloat32x4 b = A - (5.f * B) / 2.f + 2.f * C - D / 2.f; - FfxFloat32x4 c = -A / 2.f + C / 2.f; - FfxFloat32x4 d = B; - return a * t3 + b * t2 + c * t + d; + return A + (B - A) * t; } -FfxFloat32x4 BicubicCatmullRom(FetchedBicubicSamples BicubicSamples, FfxFloat32x2 fPxFrac) +FFX_MIN16_F4 Bilinear(FetchedBilinearSamplesMin16 BilinearSamples, FFX_MIN16_F2 fPxFrac) { - FfxFloat32x4 fColorX0 = CubicCatmullRom(BicubicSamples.fColor00, BicubicSamples.fColor10, BicubicSamples.fColor20, BicubicSamples.fColor30, fPxFrac.x); - FfxFloat32x4 fColorX1 = CubicCatmullRom(BicubicSamples.fColor01, BicubicSamples.fColor11, BicubicSamples.fColor21, BicubicSamples.fColor31, fPxFrac.x); - FfxFloat32x4 fColorX2 = CubicCatmullRom(BicubicSamples.fColor02, BicubicSamples.fColor12, BicubicSamples.fColor22, BicubicSamples.fColor32, fPxFrac.x); - FfxFloat32x4 fColorX3 = CubicCatmullRom(BicubicSamples.fColor03, BicubicSamples.fColor13, BicubicSamples.fColor23, BicubicSamples.fColor33, fPxFrac.x); - FfxFloat32x4 fColorXY = CubicCatmullRom(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); + FFX_MIN16_F4 fColorX0 = Linear(BilinearSamples.fColor00, BilinearSamples.fColor10, fPxFrac.x); + FFX_MIN16_F4 fColorX1 = Linear(BilinearSamples.fColor01, BilinearSamples.fColor11, fPxFrac.x); + FFX_MIN16_F4 fColorXY = Linear(fColorX0, fColorX1, fPxFrac.y); return fColorXY; } +#endif -FfxFloat32 Lanczos2(FfxFloat32 x) +FfxFloat32 Lanczos2NoClamp(FfxFloat32 x) { const FfxFloat32 PI = 3.141592653589793f; // TODO: share SDK constants return abs(x) < FSR2_EPSILON ? 1.f : (sin(PI * x) / (PI * x)) * (sin(0.5f * PI * x) / (0.5f * PI * x)); } -#if FFX_HALF -FfxFloat16 Lanczos2(FfxFloat16 x) +FfxFloat32 Lanczos2(FfxFloat32 x) { - const FFX_MIN16_F PI = FfxFloat16(3.141592653589793f); // TODO: share SDK constants - return abs(x) < FSR2_EPSILON ? FfxFloat16(1.f) : (sin(PI * x) / (PI * x)) * (sin(FfxFloat16(0.5f) * PI * x) / (FfxFloat16(0.5f) * PI * x)); + x = ffxMin(abs(x), 2.0f); + return Lanczos2NoClamp(x); +} + +#if FFX_HALF +FFX_MIN16_F Lanczos2NoClamp(FFX_MIN16_F x) +{ + const FFX_MIN16_F PI = FFX_MIN16_F(3.141592653589793f); // TODO: share SDK constants + return abs(x) < FFX_MIN16_F(FSR2_EPSILON) ? 
FFX_MIN16_F(1.f) : (sin(PI * x) / (PI * x)) * (sin(FFX_MIN16_F(0.5f) * PI * x) / (FFX_MIN16_F(0.5f) * PI * x)); +} + +FFX_MIN16_F Lanczos2(FFX_MIN16_F x) +{ + x = ffxMin(abs(x), FFX_MIN16_F(2.0f)); + return Lanczos2NoClamp(x); } #endif //FFX_HALF @@ -144,11 +159,11 @@ FfxFloat32 Lanczos2ApproxSqNoClamp(FfxFloat32 x2) } #if FFX_HALF -FfxFloat16 Lanczos2ApproxSqNoClamp(FfxFloat16 x2) +FFX_MIN16_F Lanczos2ApproxSqNoClamp(FFX_MIN16_F x2) { - FfxFloat16 a = FfxFloat16(2.0f / 5.0f) * x2 - FfxFloat16(1); - FfxFloat16 b = FfxFloat16(1.0f / 4.0f) * x2 - FfxFloat16(1); - return (FfxFloat16(25.0f / 16.0f) * a * a - FfxFloat16(25.0f / 16.0f - 1)) * (b * b); + FFX_MIN16_F a = FFX_MIN16_F(2.0f / 5.0f) * x2 - FFX_MIN16_F(1); + FFX_MIN16_F b = FFX_MIN16_F(1.0f / 4.0f) * x2 - FFX_MIN16_F(1); + return (FFX_MIN16_F(25.0f / 16.0f) * a * a - FFX_MIN16_F(25.0f / 16.0f - 1)) * (b * b); } #endif //FFX_HALF @@ -159,9 +174,9 @@ FfxFloat32 Lanczos2ApproxSq(FfxFloat32 x2) } #if FFX_HALF -FfxFloat16 Lanczos2ApproxSq(FfxFloat16 x2) +FFX_MIN16_F Lanczos2ApproxSq(FFX_MIN16_F x2) { - x2 = ffxMin(x2, FfxFloat16(4.0f)); + x2 = ffxMin(x2, FFX_MIN16_F(4.0f)); return Lanczos2ApproxSqNoClamp(x2); } #endif //FFX_HALF @@ -172,7 +187,7 @@ FfxFloat32 Lanczos2ApproxNoClamp(FfxFloat32 x) } #if FFX_HALF -FfxFloat16 Lanczos2ApproxNoClamp(FfxFloat16 x) +FFX_MIN16_F Lanczos2ApproxNoClamp(FFX_MIN16_F x) { return Lanczos2ApproxSqNoClamp(x * x); } @@ -184,7 +199,7 @@ FfxFloat32 Lanczos2Approx(FfxFloat32 x) } #if FFX_HALF -FfxFloat16 Lanczos2Approx(FfxFloat16 x) +FFX_MIN16_F Lanczos2Approx(FFX_MIN16_F x) { return Lanczos2ApproxSq(x * x); } @@ -196,14 +211,13 @@ FfxFloat32 Lanczos2_UseLUT(FfxFloat32 x) } #if FFX_HALF -FfxFloat16 Lanczos2_UseLUT(FfxFloat16 x) +FFX_MIN16_F Lanczos2_UseLUT(FFX_MIN16_F x) { - return SampleLanczos2Weight(abs(x)); + return FFX_MIN16_F(SampleLanczos2Weight(abs(x))); } #endif //FFX_HALF -#if FFX_FSR2_OPTION_USE_LANCZOS_LUT -FfxFloat32x4 Lanczos2_AllowLUT(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 fColor2, FfxFloat32x4 fColor3, FfxFloat32 t) +FfxFloat32x4 Lanczos2_UseLUT(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 fColor2, FfxFloat32x4 fColor3, FfxFloat32 t) { FfxFloat32 fWeight0 = Lanczos2_UseLUT(-1.f - t); FfxFloat32 fWeight1 = Lanczos2_UseLUT(-0.f - t); @@ -212,18 +226,15 @@ FfxFloat32x4 Lanczos2_AllowLUT(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFl return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3); } #if FFX_HALF -FfxFloat16x4 Lanczos2_AllowLUT(FfxFloat16x4 fColor0, FfxFloat16x4 fColor1, FfxFloat16x4 fColor2, FfxFloat16x4 fColor3, FfxFloat16 t) +FFX_MIN16_F4 Lanczos2_UseLUT(FFX_MIN16_F4 fColor0, FFX_MIN16_F4 fColor1, FFX_MIN16_F4 fColor2, FFX_MIN16_F4 fColor3, FFX_MIN16_F t) { - FfxFloat16 fWeight0 = Lanczos2_UseLUT(FfxFloat16(-1.f) - t); - FfxFloat16 fWeight1 = Lanczos2_UseLUT(FfxFloat16(-0.f) - t); - FfxFloat16 fWeight2 = Lanczos2_UseLUT(FfxFloat16(+1.f) - t); - FfxFloat16 fWeight3 = Lanczos2_UseLUT(FfxFloat16(+2.f) - t); + FFX_MIN16_F fWeight0 = Lanczos2_UseLUT(FFX_MIN16_F(-1.f) - t); + FFX_MIN16_F fWeight1 = Lanczos2_UseLUT(FFX_MIN16_F(-0.f) - t); + FFX_MIN16_F fWeight2 = Lanczos2_UseLUT(FFX_MIN16_F(+1.f) - t); + FFX_MIN16_F fWeight3 = Lanczos2_UseLUT(FFX_MIN16_F(+2.f) - t); return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3); } -#endif //FFX_HALF -#else //FFX_FSR2_OPTION_USE_LANCZOS_LUT -#define 
Lanczos2_AllowLUT Lanczos2 -#endif //FFX_FSR2_OPTION_USE_LANCZOS_LUT +#endif FfxFloat32x4 Lanczos2(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 fColor2, FfxFloat32x4 fColor3, FfxFloat32 t) { @@ -236,11 +247,11 @@ FfxFloat32x4 Lanczos2(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 f FfxFloat32x4 Lanczos2(FetchedBicubicSamples Samples, FfxFloat32x2 fPxFrac) { - FfxFloat32x4 fColorX0 = Lanczos2_AllowLUT(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); - FfxFloat32x4 fColorX1 = Lanczos2_AllowLUT(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); - FfxFloat32x4 fColorX2 = Lanczos2_AllowLUT(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); - FfxFloat32x4 fColorX3 = Lanczos2_AllowLUT(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); - FfxFloat32x4 fColorXY = Lanczos2_AllowLUT(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); + FfxFloat32x4 fColorX0 = Lanczos2(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); + FfxFloat32x4 fColorX1 = Lanczos2(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); + FfxFloat32x4 fColorX2 = Lanczos2(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); + FfxFloat32x4 fColorX3 = Lanczos2(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); + FfxFloat32x4 fColorXY = Lanczos2(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); // Deringing @@ -269,36 +280,36 @@ FfxFloat32x4 Lanczos2(FetchedBicubicSamples Samples, FfxFloat32x2 fPxFrac) } #if FFX_HALF -FfxFloat16x4 Lanczos2(FfxFloat16x4 fColor0, FfxFloat16x4 fColor1, FfxFloat16x4 fColor2, FfxFloat16x4 fColor3, FfxFloat16 t) +FFX_MIN16_F4 Lanczos2(FFX_MIN16_F4 fColor0, FFX_MIN16_F4 fColor1, FFX_MIN16_F4 fColor2, FFX_MIN16_F4 fColor3, FFX_MIN16_F t) { - FfxFloat16 fWeight0 = Lanczos2(FfxFloat16(-1.f) - t); - FfxFloat16 fWeight1 = Lanczos2(FfxFloat16(-0.f) - t); - FfxFloat16 fWeight2 = Lanczos2(FfxFloat16(+1.f) - t); - FfxFloat16 fWeight3 = Lanczos2(FfxFloat16(+2.f) - t); + FFX_MIN16_F fWeight0 = Lanczos2(FFX_MIN16_F(-1.f) - t); + FFX_MIN16_F fWeight1 = Lanczos2(FFX_MIN16_F(-0.f) - t); + FFX_MIN16_F fWeight2 = Lanczos2(FFX_MIN16_F(+1.f) - t); + FFX_MIN16_F fWeight3 = Lanczos2(FFX_MIN16_F(+2.f) - t); return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3); } -FfxFloat16x4 Lanczos2(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac) +FFX_MIN16_F4 Lanczos2(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac) { - FfxFloat16x4 fColorX0 = Lanczos2_AllowLUT(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); - FfxFloat16x4 fColorX1 = Lanczos2_AllowLUT(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); - FfxFloat16x4 fColorX2 = Lanczos2_AllowLUT(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); - FfxFloat16x4 fColorX3 = Lanczos2_AllowLUT(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); - FfxFloat16x4 fColorXY = Lanczos2_AllowLUT(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); + FFX_MIN16_F4 fColorX0 = Lanczos2(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); + FFX_MIN16_F4 fColorX1 = Lanczos2(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); + FFX_MIN16_F4 
fColorX2 = Lanczos2(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); + FFX_MIN16_F4 fColorX3 = Lanczos2(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); + FFX_MIN16_F4 fColorXY = Lanczos2(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); // Deringing // TODO: only use 4 by checking jitter const FfxInt32 iDeringingSampleCount = 4; - const FfxFloat16x4 fDeringingSamples[4] = { + const FFX_MIN16_F4 fDeringingSamples[4] = { Samples.fColor11, Samples.fColor21, Samples.fColor12, Samples.fColor22, }; - FfxFloat16x4 fDeringingMin = fDeringingSamples[0]; - FfxFloat16x4 fDeringingMax = fDeringingSamples[0]; + FFX_MIN16_F4 fDeringingMin = fDeringingSamples[0]; + FFX_MIN16_F4 fDeringingMax = fDeringingSamples[0]; FFX_UNROLL for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex) @@ -313,6 +324,79 @@ FfxFloat16x4 Lanczos2(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac) } #endif //FFX_HALF + +FfxFloat32x4 Lanczos2LUT(FetchedBicubicSamples Samples, FfxFloat32x2 fPxFrac) +{ + FfxFloat32x4 fColorX0 = Lanczos2_UseLUT(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); + FfxFloat32x4 fColorX1 = Lanczos2_UseLUT(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); + FfxFloat32x4 fColorX2 = Lanczos2_UseLUT(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); + FfxFloat32x4 fColorX3 = Lanczos2_UseLUT(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); + FfxFloat32x4 fColorXY = Lanczos2_UseLUT(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); + + // Deringing + + // TODO: only use 4 by checking jitter + const FfxInt32 iDeringingSampleCount = 4; + const FfxFloat32x4 fDeringingSamples[4] = { + Samples.fColor11, + Samples.fColor21, + Samples.fColor12, + Samples.fColor22, + }; + + FfxFloat32x4 fDeringingMin = fDeringingSamples[0]; + FfxFloat32x4 fDeringingMax = fDeringingSamples[0]; + + FFX_UNROLL + for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex) { + + fDeringingMin = ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]); + fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]); + } + + fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax); + + return fColorXY; +} + +#if FFX_HALF +FFX_MIN16_F4 Lanczos2LUT(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac) +{ + FFX_MIN16_F4 fColorX0 = Lanczos2_UseLUT(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); + FFX_MIN16_F4 fColorX1 = Lanczos2_UseLUT(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); + FFX_MIN16_F4 fColorX2 = Lanczos2_UseLUT(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); + FFX_MIN16_F4 fColorX3 = Lanczos2_UseLUT(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); + FFX_MIN16_F4 fColorXY = Lanczos2_UseLUT(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); + + // Deringing + + // TODO: only use 4 by checking jitter + const FfxInt32 iDeringingSampleCount = 4; + const FFX_MIN16_F4 fDeringingSamples[4] = { + Samples.fColor11, + Samples.fColor21, + Samples.fColor12, + Samples.fColor22, + }; + + FFX_MIN16_F4 fDeringingMin = fDeringingSamples[0]; + FFX_MIN16_F4 fDeringingMax = fDeringingSamples[0]; + + FFX_UNROLL + for (FfxInt32 iSampleIndex = 1; iSampleIndex < iDeringingSampleCount; ++iSampleIndex) + { + fDeringingMin = 
ffxMin(fDeringingMin, fDeringingSamples[iSampleIndex]); + fDeringingMax = ffxMax(fDeringingMax, fDeringingSamples[iSampleIndex]); + } + + fColorXY = clamp(fColorXY, fDeringingMin, fDeringingMax); + + return fColorXY; +} +#endif //FFX_HALF + + + FfxFloat32x4 Lanczos2Approx(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat32x4 fColor2, FfxFloat32x4 fColor3, FfxFloat32 t) { FfxFloat32 fWeight0 = Lanczos2ApproxNoClamp(-1.f - t); @@ -323,12 +407,12 @@ FfxFloat32x4 Lanczos2Approx(FfxFloat32x4 fColor0, FfxFloat32x4 fColor1, FfxFloat } #if FFX_HALF -FfxFloat16x4 Lanczos2Approx(FfxFloat16x4 fColor0, FfxFloat16x4 fColor1, FfxFloat16x4 fColor2, FfxFloat16x4 fColor3, FfxFloat16 t) +FFX_MIN16_F4 Lanczos2Approx(FFX_MIN16_F4 fColor0, FFX_MIN16_F4 fColor1, FFX_MIN16_F4 fColor2, FFX_MIN16_F4 fColor3, FFX_MIN16_F t) { - FfxFloat16 fWeight0 = Lanczos2ApproxNoClamp(FfxFloat16(-1.f) - t); - FfxFloat16 fWeight1 = Lanczos2ApproxNoClamp(FfxFloat16(-0.f) - t); - FfxFloat16 fWeight2 = Lanczos2ApproxNoClamp(FfxFloat16(+1.f) - t); - FfxFloat16 fWeight3 = Lanczos2ApproxNoClamp(FfxFloat16(+2.f) - t); + FFX_MIN16_F fWeight0 = Lanczos2ApproxNoClamp(FFX_MIN16_F(-1.f) - t); + FFX_MIN16_F fWeight1 = Lanczos2ApproxNoClamp(FFX_MIN16_F(-0.f) - t); + FFX_MIN16_F fWeight2 = Lanczos2ApproxNoClamp(FFX_MIN16_F(+1.f) - t); + FFX_MIN16_F fWeight3 = Lanczos2ApproxNoClamp(FFX_MIN16_F(+2.f) - t); return (fWeight0 * fColor0 + fWeight1 * fColor1 + fWeight2 * fColor2 + fWeight3 * fColor3) / (fWeight0 + fWeight1 + fWeight2 + fWeight3); } #endif //FFX_HALF @@ -368,27 +452,27 @@ FfxFloat32x4 Lanczos2Approx(FetchedBicubicSamples Samples, FfxFloat32x2 fPxFrac) } #if FFX_HALF -FfxFloat16x4 Lanczos2Approx(FetchedBicubicSamplesMin16 Samples, FfxFloat16x2 fPxFrac) +FFX_MIN16_F4 Lanczos2Approx(FetchedBicubicSamplesMin16 Samples, FFX_MIN16_F2 fPxFrac) { - FfxFloat16x4 fColorX0 = Lanczos2Approx(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); - FfxFloat16x4 fColorX1 = Lanczos2Approx(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); - FfxFloat16x4 fColorX2 = Lanczos2Approx(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); - FfxFloat16x4 fColorX3 = Lanczos2Approx(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); - FfxFloat16x4 fColorXY = Lanczos2Approx(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); + FFX_MIN16_F4 fColorX0 = Lanczos2Approx(Samples.fColor00, Samples.fColor10, Samples.fColor20, Samples.fColor30, fPxFrac.x); + FFX_MIN16_F4 fColorX1 = Lanczos2Approx(Samples.fColor01, Samples.fColor11, Samples.fColor21, Samples.fColor31, fPxFrac.x); + FFX_MIN16_F4 fColorX2 = Lanczos2Approx(Samples.fColor02, Samples.fColor12, Samples.fColor22, Samples.fColor32, fPxFrac.x); + FFX_MIN16_F4 fColorX3 = Lanczos2Approx(Samples.fColor03, Samples.fColor13, Samples.fColor23, Samples.fColor33, fPxFrac.x); + FFX_MIN16_F4 fColorXY = Lanczos2Approx(fColorX0, fColorX1, fColorX2, fColorX3, fPxFrac.y); // Deringing // TODO: only use 4 by checking jitter const FfxInt32 iDeringingSampleCount = 4; - const FfxFloat16x4 fDeringingSamples[4] = { + const FFX_MIN16_F4 fDeringingSamples[4] = { Samples.fColor11, Samples.fColor21, Samples.fColor12, Samples.fColor22, }; - FfxFloat16x4 fDeringingMin = fDeringingSamples[0]; - FfxFloat16x4 fDeringingMax = fDeringingSamples[0]; + FFX_MIN16_F4 fDeringingMin = fDeringingSamples[0]; + FFX_MIN16_F4 fDeringingMax = fDeringingSamples[0]; FFX_UNROLL for (FfxInt32 iSampleIndex = 1; iSampleIndex < 
iDeringingSampleCount; ++iSampleIndex) @@ -401,94 +485,110 @@ FfxFloat16x4 Lanczos2Approx(FetchedBicubicSamplesMin16 Samples, FfxFloat16x2 fPx return fColorXY; } +#endif // Clamp by offset direction. Assuming iPxSample is already in range and iPxOffset is compile time constant. -FfxInt16x2 ClampLoadBicubic(FfxInt16x2 iPxSample, FfxInt16x2 iPxOffset, FfxInt16x2 iTextureSize) +FfxInt32x2 ClampCoord(FfxInt32x2 iPxSample, FfxInt32x2 iPxOffset, FfxInt32x2 iTextureSize) { - FfxInt16x2 result = iPxSample + iPxOffset; - result.x = (iPxOffset.x <= 0) ? ffxMax(result.x, FfxInt16(0)) : result.x; - result.x = (iPxOffset.x > 0) ? ffxMin(result.x, iTextureSize.x - FfxInt16(1)) : result.x; - result.y = (iPxOffset.y <= 0) ? ffxMax(result.y, FfxInt16(0)) : result.y; - result.y = (iPxOffset.y > 0) ? ffxMin(result.y, iTextureSize.y - FfxInt16(1)) : result.y; + FfxInt32x2 result = iPxSample + iPxOffset; + result.x = (iPxOffset.x < 0) ? ffxMax(result.x, 0) : result.x; + result.x = (iPxOffset.x > 0) ? ffxMin(result.x, iTextureSize.x - 1) : result.x; + result.y = (iPxOffset.y < 0) ? ffxMax(result.y, 0) : result.y; + result.y = (iPxOffset.y > 0) ? ffxMin(result.y, iTextureSize.y - 1) : result.y; + return result; +} +#if FFX_HALF +FFX_MIN16_I2 ClampCoord(FFX_MIN16_I2 iPxSample, FFX_MIN16_I2 iPxOffset, FFX_MIN16_I2 iTextureSize) +{ + FFX_MIN16_I2 result = iPxSample + iPxOffset; + result.x = (iPxOffset.x < FFX_MIN16_I(0)) ? ffxMax(result.x, FFX_MIN16_I(0)) : result.x; + result.x = (iPxOffset.x > FFX_MIN16_I(0)) ? ffxMin(result.x, iTextureSize.x - FFX_MIN16_I(1)) : result.x; + result.y = (iPxOffset.y < FFX_MIN16_I(0)) ? ffxMax(result.y, FFX_MIN16_I(0)) : result.y; + result.y = (iPxOffset.y > FFX_MIN16_I(0)) ? ffxMin(result.y, iTextureSize.y - FFX_MIN16_I(1)) : result.y; return result; } #endif //FFX_HALF -FfxInt32x2 ClampLoadBicubic(FfxInt32x2 iPxSample, FfxInt32x2 iPxOffset, FfxInt32x2 iTextureSize) -{ - FfxInt32x2 result = iPxSample + iPxOffset; - result.x = (iPxOffset.x <= 0) ? ffxMax(result.x, 0) : result.x; - result.x = (iPxOffset.x > 0) ? ffxMin(result.x, iTextureSize.x - 1) : result.x; - result.y = (iPxOffset.y <= 0) ? ffxMax(result.y, 0) : result.y; - result.y = (iPxOffset.y > 0) ? 
ffxMin(result.y, iTextureSize.y - 1) : result.y; - return result; -} -#define DeclareCustomFetchBicubicSamplesWithType(SampleType, AddrType, Name, LoadTexture) \ +#define DeclareCustomFetchBicubicSamplesWithType(SampleType, TextureType, AddrType, Name, LoadTexture) \ SampleType Name(AddrType iPxSample, AddrType iTextureSize) \ { \ SampleType Samples; \ \ - Samples.fColor00 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(-1, -1), iTextureSize)); \ - Samples.fColor10 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+0, -1), iTextureSize)); \ - Samples.fColor20 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+1, -1), iTextureSize)); \ - Samples.fColor30 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+2, -1), iTextureSize)); \ + Samples.fColor00 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, -1), iTextureSize))); \ + Samples.fColor10 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, -1), iTextureSize))); \ + Samples.fColor20 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, -1), iTextureSize))); \ + Samples.fColor30 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, -1), iTextureSize))); \ \ - Samples.fColor01 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(-1, +0), iTextureSize)); \ - Samples.fColor11 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+0, +0), iTextureSize)); \ - Samples.fColor21 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+1, +0), iTextureSize)); \ - Samples.fColor31 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+2, +0), iTextureSize)); \ + Samples.fColor01 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, +0), iTextureSize))); \ + Samples.fColor11 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +0), iTextureSize))); \ + Samples.fColor21 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +0), iTextureSize))); \ + Samples.fColor31 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, +0), iTextureSize))); \ \ - Samples.fColor02 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(-1, +1), iTextureSize)); \ - Samples.fColor12 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+0, +1), iTextureSize)); \ - Samples.fColor22 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+1, +1), iTextureSize)); \ - Samples.fColor32 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+2, +1), iTextureSize)); \ + Samples.fColor02 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, +1), iTextureSize))); \ + Samples.fColor12 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +1), iTextureSize))); \ + Samples.fColor22 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +1), iTextureSize))); \ + Samples.fColor32 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, +1), iTextureSize))); \ \ - Samples.fColor03 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(-1, +2), iTextureSize)); \ - Samples.fColor13 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+0, +2), iTextureSize)); \ - Samples.fColor23 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+1, +2), iTextureSize)); \ - Samples.fColor33 = LoadTexture(ClampLoadBicubic(iPxSample, AddrType(+2, +2), iTextureSize)); \ + Samples.fColor03 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(-1, +2), iTextureSize))); \ + Samples.fColor13 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +2), iTextureSize))); \ + Samples.fColor23 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +2), iTextureSize))); \ + Samples.fColor33 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+2, 
+2), iTextureSize))); \ \ return Samples; \ } #define DeclareCustomFetchBicubicSamples(Name, LoadTexture) \ - DeclareCustomFetchBicubicSamplesWithType(FetchedBicubicSamples, FfxInt32x2, Name, LoadTexture) + DeclareCustomFetchBicubicSamplesWithType(FetchedBicubicSamples, FfxFloat32x4, FfxInt32x2, Name, LoadTexture) #define DeclareCustomFetchBicubicSamplesMin16(Name, LoadTexture) \ - DeclareCustomFetchBicubicSamplesWithType(FetchedBicubicSamplesMin16, FFX_MIN16_I2, Name, LoadTexture) + DeclareCustomFetchBicubicSamplesWithType(FetchedBicubicSamplesMin16, FFX_MIN16_F4, FfxInt32x2, Name, LoadTexture) -#define DeclareCustomFetchBilinearSamples(Name, LoadTexture) \ - FetchedBilinearSamples Name(FFX_MIN16_I2 iPxSample, FFX_MIN16_I2 iTextureSize) \ +#define DeclareCustomFetchBilinearSamplesWithType(SampleType, TextureType,AddrType, Name, LoadTexture) \ + SampleType Name(AddrType iPxSample, AddrType iTextureSize) \ { \ - FetchedBilinearSamples Samples; \ - Samples.fColor00 = LoadTexture(ClampLoad(iPxSample, FFX_MIN16_I2(+0, +0), iTextureSize)); \ - Samples.fColor10 = LoadTexture(ClampLoad(iPxSample, FFX_MIN16_I2(+1, +0), iTextureSize)); \ - Samples.fColor01 = LoadTexture(ClampLoad(iPxSample, FFX_MIN16_I2(+0, +1), iTextureSize)); \ - Samples.fColor11 = LoadTexture(ClampLoad(iPxSample, FFX_MIN16_I2(+1, +1), iTextureSize)); \ + SampleType Samples; \ + Samples.fColor00 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +0), iTextureSize))); \ + Samples.fColor10 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +0), iTextureSize))); \ + Samples.fColor01 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+0, +1), iTextureSize))); \ + Samples.fColor11 = TextureType(LoadTexture(ClampCoord(iPxSample, AddrType(+1, +1), iTextureSize))); \ return Samples; \ } +#define DeclareCustomFetchBilinearSamples(Name, LoadTexture) \ + DeclareCustomFetchBilinearSamplesWithType(FetchedBilinearSamples, FfxFloat32x4, FfxInt32x2, Name, LoadTexture) + +#define DeclareCustomFetchBilinearSamplesMin16(Name, LoadTexture) \ + DeclareCustomFetchBilinearSamplesWithType(FetchedBilinearSamplesMin16, FFX_MIN16_F4, FfxInt32x2, Name, LoadTexture) + // BE CAREFUL: there is some precision issues and (3253, 125) leading to (3252.9989778, 125.001102) // is common, so iPxSample can "jitter" #define DeclareCustomTextureSample(Name, InterpolateSamples, FetchSamples) \ FfxFloat32x4 Name(FfxFloat32x2 fUvSample, FfxInt32x2 iTextureSize) \ { \ - FfxFloat32x2 fPxSample = fUvSample * FfxFloat32x2(iTextureSize) - FFX_BROADCAST_FLOAT32X2(0.5f); \ + FfxFloat32x2 fPxSample = fUvSample * FfxFloat32x2(iTextureSize) - FfxFloat32x2(0.5f, 0.5f); \ FfxInt32x2 iPxSample = FfxInt32x2(floor(fPxSample)); \ FfxFloat32x2 fPxFrac = ffxFract(fPxSample); \ - FfxFloat32x4 fColorXY = FfxFloat32x4(InterpolateSamples(FetchSamples(FFX_MIN16_I2(iPxSample), FFX_MIN16_I2(iTextureSize)), FFX_MIN16_F2(fPxFrac))); \ + FfxFloat32x4 fColorXY = FfxFloat32x4(InterpolateSamples(FetchSamples(iPxSample, iTextureSize), fPxFrac)); \ return fColorXY; \ } #define DeclareCustomTextureSampleMin16(Name, InterpolateSamples, FetchSamples) \ - FfxFloat32x4 Name(FfxFloat32x2 fUvSample, FfxInt32x2 iTextureSize) \ + FFX_MIN16_F4 Name(FfxFloat32x2 fUvSample, FfxInt32x2 iTextureSize) \ { \ - FfxFloat32x2 fPxSample = fUvSample * FfxFloat32x2(iTextureSize) - FFX_BROADCAST_FLOAT32X2(0.5f); \ + FfxFloat32x2 fPxSample = fUvSample * FfxFloat32x2(iTextureSize) - FfxFloat32x2(0.5f, 0.5f); \ FfxInt32x2 iPxSample = FfxInt32x2(floor(fPxSample)); \ - FfxFloat32x2 fPxFrac = 
ffxFract(fPxSample); \ - FfxFloat32x4 fColorXY = FfxFloat32x4(InterpolateSamples(FetchSamples(FFX_MIN16_I2(iPxSample), FFX_MIN16_I2(iTextureSize)), FFX_MIN16_F2(fPxFrac))); \ + FFX_MIN16_F2 fPxFrac = FFX_MIN16_F2(ffxFract(fPxSample)); \ + FFX_MIN16_F4 fColorXY = FFX_MIN16_F4(InterpolateSamples(FetchSamples(iPxSample, iTextureSize), fPxFrac)); \ return fColorXY; \ } +#define FFX_FSR2_CONCAT_ID(x, y) x ## y +#define FFX_FSR2_CONCAT(x, y) FFX_FSR2_CONCAT_ID(x, y) +#define FFX_FSR2_SAMPLER_1D_0 Lanczos2 +#define FFX_FSR2_SAMPLER_1D_1 Lanczos2LUT +#define FFX_FSR2_SAMPLER_1D_2 Lanczos2Approx + +#define FFX_FSR2_GET_LANCZOS_SAMPLER1D(x) FFX_FSR2_CONCAT(FFX_FSR2_SAMPLER_1D_, x) + #endif //!defined( FFX_FSR2_SAMPLE_H ) diff --git a/src/ffx-fsr2-api/shaders/ffx_fsr2_upsample.h b/src/ffx-fsr2-api/shaders/ffx_fsr2_upsample.h index 0b83d6a..80524d4 100644 --- a/src/ffx-fsr2-api/shaders/ffx_fsr2_upsample.h +++ b/src/ffx-fsr2-api/shaders/ffx_fsr2_upsample.h @@ -22,54 +22,100 @@ #ifndef FFX_FSR2_UPSAMPLE_H #define FFX_FSR2_UPSAMPLE_H -FfxFloat32 SmoothStep(FfxFloat32 x, FfxFloat32 a, FfxFloat32 b) -{ - x = clamp((x - a) / (b - a), 0.f, 1.f); - return x * x * (3.f - 2.f * x); -} +#define FFX_FSR2_OPTION_GUARANTEE_POSITIVE_UPSAMPLE_WEIGHT 0 FFX_STATIC const FfxUInt32 iLanczos2SampleCount = 16; -void DeringingWithMinMax(UPSAMPLE_F3 fDeringingMin, UPSAMPLE_F3 fDeringingMax, FFX_PARAMETER_INOUT UPSAMPLE_F3 fColor, FFX_PARAMETER_OUT FfxFloat32 fRangeSimilarity) -{ - fRangeSimilarity = fDeringingMin.x / fDeringingMax.x; - fColor = clamp(fColor, fDeringingMin, fDeringingMax); -} - -void Deringing(RectificationBoxData clippingBox, FFX_PARAMETER_INOUT UPSAMPLE_F3 fColor) +void Deringing(RectificationBoxData clippingBox, FFX_PARAMETER_INOUT FfxFloat32x3 fColor) { fColor = clamp(fColor, clippingBox.aabbMin, clippingBox.aabbMax); } - -UPSAMPLE_F GetUpsampleLanczosWeight(UPSAMPLE_F2 fSrcSampleOffset, UPSAMPLE_F2 fKernelWeight) +#if FFX_HALF +void Deringing(RectificationBoxDataMin16 clippingBox, FFX_PARAMETER_INOUT FFX_MIN16_F3 fColor) { - UPSAMPLE_F2 fSrcSampleOffsetBiased = UPSAMPLE_F2(fSrcSampleOffset * fKernelWeight); - UPSAMPLE_F fSampleWeight = Lanczos2ApproxSq(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased)); // TODO: check other distances (l0, l1, linf...) 
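For orientation, the three Lanczos sampler variants selected by FFX_FSR2_GET_LANCZOS_SAMPLER1D reduce to two formulas already shown in this patch: the reference sinc-based kernel (Lanczos2NoClamp) and the polynomial approximation on the squared argument (Lanczos2ApproxSqNoClamp). A standalone C++ transcription of those formulas follows, together with the way the upsample weight feeds the squared, kernel-biased offset into the approximate form; FSR2_EPSILON is replaced by a literal tolerance here.

#include <cmath>
#include <algorithm>

// Reference Lanczos-2 kernel: sinc(pi*x) * sinc(pi*x/2), clamped to the [-2, 2] support.
static float Lanczos2(float x) {
    constexpr float kPi = 3.141592653589793f;
    x = std::min(std::fabs(x), 2.0f);
    if (x < 1e-6f) return 1.0f; // limit of sinc at 0
    return (std::sin(kPi * x) / (kPi * x)) * (std::sin(0.5f * kPi * x) / (0.5f * kPi * x));
}

// Polynomial approximation on x^2, matching Lanczos2ApproxSq in the diff:
// (25/16 * a^2 - (25/16 - 1)) * b^2 with a = (2/5)x^2 - 1 and b = (1/4)x^2 - 1.
static float Lanczos2ApproxSq(float x2) {
    x2 = std::min(x2, 4.0f);
    const float a = (2.0f / 5.0f) * x2 - 1.0f;
    const float b = (1.0f / 4.0f) * x2 - 1.0f;
    return ((25.0f / 16.0f) * a * a - (25.0f / 16.0f - 1.0f)) * (b * b);
}

// Approximate upsample weight: bias the sample offset by the kernel weight, then
// evaluate on the squared distance, as in GetUpsampleLanczosWeight (type 2).
static float UpsampleWeight(float dx, float dy, float kx, float ky) {
    const float bx = dx * kx, by = dy * ky;
    return Lanczos2ApproxSq(bx * bx + by * by);
}

Working on the squared distance avoids a square root per sample; the _UseLUT variant instead looks the weight up via SampleLanczos2Weight.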
+ fColor = clamp(fColor, clippingBox.aabbMin, clippingBox.aabbMax); +} +#endif +#ifndef FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE +#define FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE 1 // Approximate +#endif + +FfxFloat32 GetUpsampleLanczosWeight(FfxFloat32x2 fSrcSampleOffset, FfxFloat32x2 fKernelWeight) +{ + FfxFloat32x2 fSrcSampleOffsetBiased = fSrcSampleOffset * fKernelWeight; +#if FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 0 // LANCZOS_TYPE_REFERENCE + FfxFloat32 fSampleWeight = Lanczos2(length(fSrcSampleOffsetBiased)); +#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 1 // LANCZOS_TYPE_LUT + FfxFloat32 fSampleWeight = Lanczos2_UseLUT(length(fSrcSampleOffsetBiased)); +#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_APPROXIMATE + FfxFloat32 fSampleWeight = Lanczos2ApproxSq(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased)); +#else +#error "Invalid Lanczos type" +#endif return fSampleWeight; } -UPSAMPLE_F Pow3(UPSAMPLE_F x) +#if FFX_HALF +FFX_MIN16_F GetUpsampleLanczosWeight(FFX_MIN16_F2 fSrcSampleOffset, FFX_MIN16_F2 fKernelWeight) +{ + FFX_MIN16_F2 fSrcSampleOffsetBiased = fSrcSampleOffset * fKernelWeight; +#if FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 0 // LANCZOS_TYPE_REFERENCE + FFX_MIN16_F fSampleWeight = Lanczos2(length(fSrcSampleOffsetBiased)); +#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 1 // LANCZOS_TYPE_APPROXIMATE + FFX_MIN16_F fSampleWeight = Lanczos2ApproxSq(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased)); +#elif FFX_FSR2_OPTION_UPSAMPLE_USE_LANCZOS_TYPE == 2 // LANCZOS_TYPE_LUT + FFX_MIN16_F fSampleWeight = Lanczos2_UseLUT(length(fSrcSampleOffsetBiased)); + // To Test: Save reciproqual sqrt compute + // FfxFloat32 fSampleWeight = Lanczos2Sq_UseLUT(dot(fSrcSampleOffsetBiased, fSrcSampleOffsetBiased)); +#else +#error "Invalid Lanczos type" +#endif + return fSampleWeight; +} +#endif + +FfxFloat32 Pow3(FfxFloat32 x) { return x * x * x; } -UPSAMPLE_F4 ComputeUpsampledColorAndWeight(FFX_MIN16_I2 iPxHrPos, UPSAMPLE_F2 fKernelWeight, FFX_PARAMETER_INOUT RectificationBoxData clippingBox) +#if FX_HALF +FFX_MIN16_F Pow3(FFX_MIN16_F x) { + return x * x * x; +} +#endif + +FfxFloat32x4 ComputeUpsampledColorAndWeight(FfxInt32x2 iPxHrPos, FfxFloat32x2 fKernelWeight, FFX_PARAMETER_INOUT RectificationBoxData clippingBox) +{ +#if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF +#include "ffx_fsr2_force16_begin.h" +#endif // We compute a sliced lanczos filter with 2 lobes (other slices are accumulated temporaly) FfxFloat32x2 fDstOutputPos = FfxFloat32x2(iPxHrPos) + FFX_BROADCAST_FLOAT32X2(0.5f); // Destination resolution output pixel center position FfxFloat32x2 fSrcOutputPos = fDstOutputPos * DownscaleFactor(); // Source resolution output pixel center position FfxInt32x2 iSrcInputPos = FfxInt32x2(floor(fSrcOutputPos)); // TODO: what about weird upscale factors... - UPSAMPLE_F3 fSamples[iLanczos2SampleCount]; - - FfxFloat32x2 fSrcUnjitteredPos = (FfxFloat32x2(iSrcInputPos) + FFX_BROADCAST_FLOAT32X2(0.5f)) - Jitter(); // This is the un-jittered position of the sample at offset 0,0 - - UPSAMPLE_I2 offsetTL; - offsetTL.x = (fSrcUnjitteredPos.x > fSrcOutputPos.x) ? UPSAMPLE_I(-2) : UPSAMPLE_I(-1); - offsetTL.y = (fSrcUnjitteredPos.y > fSrcOutputPos.y) ? 
UPSAMPLE_I(-2) : UPSAMPLE_I(-1); +#if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF +#include "ffx_fsr2_force16_end.h" +#endif +#if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF +#include "ffx_fsr2_force16_begin.h" + RectificationBoxMin16 fRectificationBox; +#else RectificationBox fRectificationBox; +#endif + + FfxFloat32x3 fSamples[iLanczos2SampleCount]; + + + FfxFloat32x2 fSrcUnjitteredPos = (FfxFloat32x2(iSrcInputPos) + FfxFloat32x2(0.5f, 0.5f)) - Jitter(); // This is the un-jittered position of the sample at offset 0,0 + + FfxInt32x2 offsetTL; + offsetTL.x = (fSrcUnjitteredPos.x > fSrcOutputPos.x) ? FfxInt32(-2) : FfxInt32(-1); + offsetTL.y = (fSrcUnjitteredPos.y > fSrcOutputPos.y) ? FfxInt32(-2) : FfxInt32(-1); //Load samples // If fSrcUnjitteredPos.y > fSrcOutputPos.y, indicates offsetTL.y = -2, sample offset Y will be [-2, 1], clipbox will be rows [1, 3]. @@ -78,7 +124,7 @@ UPSAMPLE_F4 ComputeUpsampledColorAndWeight(FFX_MIN16_I2 iPxHrPos, UPSAMPLE_F2 fK const FfxBoolean bFlipRow = fSrcUnjitteredPos.y > fSrcOutputPos.y; const FfxBoolean bFlipCol = fSrcUnjitteredPos.x > fSrcOutputPos.x; - UPSAMPLE_F2 fOffsetTL = UPSAMPLE_F2(offsetTL); + FfxFloat32x2 fOffsetTL = FfxFloat32x2(offsetTL); FFX_UNROLL for (FfxInt32 row = 0; row < 4; row++) { @@ -92,57 +138,77 @@ UPSAMPLE_F4 ComputeUpsampledColorAndWeight(FFX_MIN16_I2 iPxHrPos, UPSAMPLE_F2 fK const FfxInt32x2 sampleCoord = ClampLoad(iSrcSamplePos, FfxInt32x2(0, 0), FfxInt32x2(RenderSize())); - fSamples[iSampleIndex] = LoadPreparedInputColor(FFX_MIN16_I2(sampleCoord)); + fSamples[iSampleIndex] = LoadPreparedInputColor(FfxInt32x2(sampleCoord)); } } RectificationBoxReset(fRectificationBox, fSamples[0]); - UPSAMPLE_F3 fColor = UPSAMPLE_F3(0.f, 0.f, 0.f); - UPSAMPLE_F fWeight = UPSAMPLE_F(0.f); - UPSAMPLE_F2 fBaseSampleOffset = UPSAMPLE_F2(fSrcUnjitteredPos - fSrcOutputPos); + FfxFloat32x3 fColor = FfxFloat32x3(0.f, 0.f, 0.f); + FfxFloat32 fWeight = FfxFloat32(0.f); + FfxFloat32x2 fBaseSampleOffset = FfxFloat32x2(fSrcUnjitteredPos - fSrcOutputPos); FFX_UNROLL - for (FfxUInt32 iSampleIndex = 0; iSampleIndex < iLanczos2SampleCount; ++iSampleIndex) - { - FfxInt32 row = FfxInt32(iSampleIndex >> 2); - FfxInt32 col = FfxInt32(iSampleIndex & 3); + for (FfxInt32 row = 0; row < 3; row++) { - const FfxInt32x2 sampleColRow = FfxInt32x2(bFlipCol ? (3 - col) : col, bFlipRow ? (3 - row) : row); - const UPSAMPLE_F2 fOffset = fOffsetTL + UPSAMPLE_F2(sampleColRow); - UPSAMPLE_F2 fSrcSampleOffset = fBaseSampleOffset + fOffset; + FFX_UNROLL + for (FfxInt32 col = 0; col < 3; col++) { + FfxInt32 iSampleIndex = col + (row << 2); - FfxInt32x2 iSrcSamplePos = FfxInt32x2(iSrcInputPos) + FfxInt32x2(offsetTL) + sampleColRow; + const FfxInt32x2 sampleColRow = FfxInt32x2(bFlipCol ? (3 - col) : col, bFlipRow ? 
(3 - row) : row); + const FfxFloat32x2 fOffset = fOffsetTL + FfxFloat32x2(sampleColRow); + FfxFloat32x2 fSrcSampleOffset = fBaseSampleOffset + fOffset; - UPSAMPLE_F fSampleWeight = UPSAMPLE_F(IsOnScreen(FFX_MIN16_I2(iSrcSamplePos), FFX_MIN16_I2(RenderSize()))) * GetUpsampleLanczosWeight(fSrcSampleOffset, fKernelWeight); + FfxInt32x2 iSrcSamplePos = FfxInt32x2(iSrcInputPos) + FfxInt32x2(offsetTL) + sampleColRow; - // Update rectification box - if(all(FFX_LESS_THAN(FfxInt32x2(col, row), FFX_BROADCAST_INT32X2(3)))) - { - //update clipping box in non-locked areas - const UPSAMPLE_F fSrcSampleOffsetSq = dot(fSrcSampleOffset, fSrcSampleOffset); - UPSAMPLE_F fBoxSampleWeight = UPSAMPLE_F(1) - ffxSaturate(fSrcSampleOffsetSq / UPSAMPLE_F(3)); + FfxFloat32 fSampleWeight = FfxFloat32(IsOnScreen(FfxInt32x2(iSrcSamplePos), FfxInt32x2(RenderSize()))) * GetUpsampleLanczosWeight(fSrcSampleOffset, fKernelWeight); + + // Update rectification box + const FfxFloat32 fSrcSampleOffsetSq = dot(fSrcSampleOffset, fSrcSampleOffset); + FfxFloat32 fBoxSampleWeight = FfxFloat32(1) - ffxSaturate(fSrcSampleOffsetSq / FfxFloat32(3)); fBoxSampleWeight *= fBoxSampleWeight; RectificationBoxAddSample(fRectificationBox, fSamples[iSampleIndex], fBoxSampleWeight); + + fWeight += fSampleWeight; + fColor += fSampleWeight * fSamples[iSampleIndex]; } - - fWeight += fSampleWeight; - fColor += fSampleWeight * fSamples[iSampleIndex]; } - // Normalize for deringing (we need to compare colors) - fColor = fColor / (abs(fWeight) > FSR2_EPSILON ? fWeight : UPSAMPLE_F(1.f)); + fColor = fColor / (abs(fWeight) > FSR2_EPSILON ? fWeight : FfxFloat32(1.f)); RectificationBoxComputeVarianceBoxData(fRectificationBox); - clippingBox = RectificationBoxGetData(fRectificationBox); +#if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF + RectificationBoxDataMin16 rectificationData = RectificationBoxGetData(fRectificationBox); + clippingBox.aabbMax = rectificationData.aabbMax; + clippingBox.aabbMin = rectificationData.aabbMin; + clippingBox.boxCenter = rectificationData.boxCenter; + clippingBox.boxVec = rectificationData.boxVec; +#else + RectificationBoxData rectificationData = RectificationBoxGetData(fRectificationBox); + clippingBox = rectificationData; +#endif - Deringing(RectificationBoxGetData(fRectificationBox), fColor); + Deringing(rectificationData, fColor); - if (any(FFX_LESS_THAN(fKernelWeight, UPSAMPLE_F2_BROADCAST(1.0f)))) { - fWeight = UPSAMPLE_F(averageLanczosWeightPerFrame); +#if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF + clippingBox.aabbMax = rectificationData.aabbMax; + clippingBox.aabbMin = rectificationData.aabbMin; + clippingBox.boxCenter = rectificationData.boxCenter; + clippingBox.boxVec = rectificationData.boxVec; +#endif + + if (any(FFX_LESS_THAN(fKernelWeight, FfxFloat32x2(1, 1)))) { + fWeight = FfxFloat32(averageLanczosWeightPerFrame); } +#if FFX_FSR2_OPTION_UPSAMPLE_SAMPLERS_USE_DATA_HALF && FFX_HALF +#include "ffx_fsr2_force16_end.h" +#endif - return UPSAMPLE_F4(fColor, ffxMax(UPSAMPLE_F(0), fWeight)); +#if FFX_FSR2_OPTION_GUARANTEE_POSITIVE_UPSAMPLE_WEIGHT + return FfxFloat32x4(fColor, ffxMax(FfxFloat32(FSR2_EPSILON), fWeight)); +#else + return FfxFloat32x4(fColor, ffxMax(FfxFloat32(0), fWeight)); +#endif } #endif //!defined( FFX_FSR2_UPSAMPLE_H ) diff --git a/src/ffx-fsr2-api/vk/CMakeLists.txt b/src/ffx-fsr2-api/vk/CMakeLists.txt index 23aec21..933d097 100644 --- a/src/ffx-fsr2-api/vk/CMakeLists.txt +++ b/src/ffx-fsr2-api/vk/CMakeLists.txt @@ -107,4 +107,4 @@ 
add_custom_target(shader_permutations_vk DEPENDS ${PERMUTATION_OUTPUTS}) add_dependencies(${FFX_SC_DEPENDENT_TARGET} shader_permutations_vk) source_group("source" FILES ${VK}) -source_group("shaders" FILES ${SHADERS}) +source_group("shaders" FILES ${SHADERS}) \ No newline at end of file diff --git a/src/ffx-fsr2-api/vk/ffx_fsr2_vk.cpp b/src/ffx-fsr2-api/vk/ffx_fsr2_vk.cpp index f984d86..867da60 100644 --- a/src/ffx-fsr2-api/vk/ffx_fsr2_vk.cpp +++ b/src/ffx-fsr2-api/vk/ffx_fsr2_vk.cpp @@ -26,11 +26,12 @@ #include #include #include +#include // prototypes for functions in the interface FfxErrorCode GetDeviceCapabilitiesVK(FfxFsr2Interface* backendInterface, FfxDeviceCapabilities* deviceCapabilities, FfxDevice device); -FfxErrorCode CreateDeviceVK(FfxFsr2Interface* backendInterface, FfxDevice device); -FfxErrorCode DestroyDeviceVK(FfxFsr2Interface* backendInterface, FfxDevice device); +FfxErrorCode CreateBackendContextVK(FfxFsr2Interface* backendInterface, FfxDevice device); +FfxErrorCode DestroyBackendContextVK(FfxFsr2Interface* backendInterface); FfxErrorCode CreateResourceVK(FfxFsr2Interface* backendInterface, const FfxCreateResourceDescription* desc, FfxResourceInternal* outResource); FfxErrorCode RegisterResourceVK(FfxFsr2Interface* backendInterface, const FfxResource* inResource, FfxResourceInternal* outResourceInternal); FfxErrorCode UnregisterResourcesVK(FfxFsr2Interface* backendInterface); @@ -38,14 +39,14 @@ FfxResourceDescription GetResourceDescriptorVK(FfxFsr2Interface* backendInterfac FfxErrorCode DestroyResourceVK(FfxFsr2Interface* backendInterface, FfxResourceInternal resource); FfxErrorCode CreatePipelineVK(FfxFsr2Interface* backendInterface, FfxFsr2Pass passId, const FfxPipelineDescription* desc, FfxPipelineState* outPass); FfxErrorCode DestroyPipelineVK(FfxFsr2Interface* backendInterface, FfxPipelineState* pipeline); -FfxErrorCode ScheduleRenderJobVK(FfxFsr2Interface* backendInterface, const FfxRenderJobDescription* job); -FfxErrorCode ExecuteRenderJobsVK(FfxFsr2Interface* backendInterface, FfxCommandList commandList); +FfxErrorCode ScheduleGpuJobVK(FfxFsr2Interface* backendInterface, const FfxGpuJobDescription* job); +FfxErrorCode ExecuteGpuJobsVK(FfxFsr2Interface* backendInterface, FfxCommandList commandList); #define FSR2_MAX_QUEUED_FRAMES ( 4) #define FSR2_MAX_RESOURCE_COUNT (64) #define FSR2_MAX_STAGING_RESOURCE_COUNT ( 8) #define FSR2_MAX_BARRIERS (16) -#define FSR2_MAX_RENDERJOBS (32) +#define FSR2_MAX_GPU_JOBS (32) #define FSR2_MAX_IMAGE_COPY_MIPS (32) #define FSR2_MAX_SAMPLERS ( 2) #define FSR2_MAX_UNIFORM_BUFFERS ( 4) @@ -135,8 +136,8 @@ typedef struct BackendContext_VK { VkDevice device = nullptr; VkFunctionTable vkFunctionTable = {}; - uint32_t renderJobCount = 0; - FfxRenderJobDescription renderJobs[FSR2_MAX_RENDERJOBS] = {}; + uint32_t gpuJobCount = 0; + FfxGpuJobDescription gpuJobs[FSR2_MAX_GPU_JOBS] = {}; uint32_t nextStaticResource = 0; uint32_t nextDynamicResource = 0; @@ -197,8 +198,8 @@ FfxErrorCode ffxFsr2GetInterfaceVK( FFX_ERROR_INSUFFICIENT_MEMORY); outInterface->fpGetDeviceCapabilities = GetDeviceCapabilitiesVK; - outInterface->fpCreateDevice = CreateDeviceVK; - outInterface->fpDestroyDevice = DestroyDeviceVK; + outInterface->fpCreateBackendContext = CreateBackendContextVK; + outInterface->fpDestroyBackendContext = DestroyBackendContextVK; outInterface->fpCreateResource = CreateResourceVK; outInterface->fpRegisterResource = RegisterResourceVK; outInterface->fpUnregisterResources = UnregisterResourcesVK; @@ -206,8 +207,8 @@ FfxErrorCode 
ffxFsr2GetInterfaceVK( outInterface->fpDestroyResource = DestroyResourceVK; outInterface->fpCreatePipeline = CreatePipelineVK; outInterface->fpDestroyPipeline = DestroyPipelineVK; - outInterface->fpScheduleRenderJob = ScheduleRenderJobVK; - outInterface->fpExecuteRenderJobs = ExecuteRenderJobsVK; + outInterface->fpScheduleGpuJob = ScheduleGpuJobVK; + outInterface->fpExecuteGpuJobs = ExecuteGpuJobsVK; outInterface->scratchBuffer = scratchBuffer; outInterface->scratchBufferSize = scratchBufferSize; @@ -305,6 +306,8 @@ VkFormat getVKFormatFromSurfaceFormat(FfxSurfaceFormat fmt) return VK_FORMAT_R16_SNORM; case(FFX_SURFACE_FORMAT_R8_UNORM): return VK_FORMAT_R8_UNORM; + case(FFX_SURFACE_FORMAT_R8G8_UNORM): + return VK_FORMAT_R8G8_UNORM; case(FFX_SURFACE_FORMAT_R32_FLOAT): return VK_FORMAT_R32_SFLOAT; default: @@ -749,7 +752,7 @@ FfxErrorCode GetDeviceCapabilitiesVK(FfxFsr2Interface* backendInterface, FfxDevi return FFX_OK; } -FfxErrorCode CreateDeviceVK(FfxFsr2Interface* backendInterface, FfxDevice device) +FfxErrorCode CreateBackendContextVK(FfxFsr2Interface* backendInterface, FfxDevice device) { FFX_ASSERT(NULL != backendInterface); @@ -926,7 +929,7 @@ FfxErrorCode CreateDeviceVK(FfxFsr2Interface* backendInterface, FfxDevice device } } - backendContext->renderJobCount = 0; + backendContext->gpuJobCount = 0; backendContext->scheduledImageBarrierCount = 0; backendContext->scheduledBufferBarrierCount = 0; backendContext->stagingResourceCount = 0; @@ -938,7 +941,7 @@ FfxErrorCode CreateDeviceVK(FfxFsr2Interface* backendInterface, FfxDevice device return FFX_OK; } -FfxErrorCode DestroyDeviceVK(FfxFsr2Interface* backendInterface, FfxDevice device) +FfxErrorCode DestroyBackendContextVK(FfxFsr2Interface* backendInterface) { FFX_ASSERT(NULL != backendInterface); @@ -973,8 +976,7 @@ FfxErrorCode DestroyDeviceVK(FfxFsr2Interface* backendInterface, FfxDevice devic backendContext->pointSampler = nullptr; backendContext->linearSampler = nullptr; - VkDevice vkDevice = reinterpret_cast(device); - if (vkDevice != nullptr) { + if (backendContext->device != nullptr) { backendContext->device = nullptr; } @@ -992,8 +994,8 @@ FfxErrorCode CreateResourceVK( FFX_ASSERT(NULL != createResourceDescription); FFX_ASSERT(NULL != outResource); - VkDevice vkDevice = reinterpret_cast(createResourceDescription->device); BackendContext_VK* backendContext = (BackendContext_VK*)backendInterface->scratchBuffer; + VkDevice vkDevice = reinterpret_cast(backendContext->device); FFX_ASSERT(backendContext->nextStaticResource + 1 < backendContext->nextDynamicResource); outResource->internalIndex = backendContext->nextStaticResource++; @@ -1196,14 +1198,14 @@ FfxErrorCode CreateResourceVK( backendInterface->fpCreateResource(backendInterface, &uploadDesc, ©Src); // setup the upload job - FfxRenderJobDescription copyJob = + FfxGpuJobDescription copyJob = { - FFX_RENDER_JOB_COPY + FFX_GPU_JOB_COPY }; copyJob.copyJobDescriptor.src = copySrc; copyJob.copyJobDescriptor.dst = *outResource; - backendInterface->fpScheduleRenderJob(backendInterface, ©Job); + backendInterface->fpScheduleGpuJob(backendInterface, ©Job); // add to the list of staging resources to delete later uint32_t stagingResIdx = backendContext->stagingResourceCount++; @@ -1277,12 +1279,11 @@ FfxErrorCode CreatePipelineVK(FfxFsr2Interface* backendInterface, FfxFsr2Pass pa flags |= (pipelineDescription->contextFlags & FFX_FSR2_ENABLE_MOTION_VECTORS_JITTER_CANCELLATION) ? 
FSR2_SHADER_PERMUTATION_JITTER_MOTION_VECTORS : 0; flags |= (pipelineDescription->contextFlags & FFX_FSR2_ENABLE_DEPTH_INVERTED) ? FSR2_SHADER_PERMUTATION_DEPTH_INVERTED : 0; flags |= (pass == FFX_FSR2_PASS_ACCUMULATE_SHARPEN) ? FSR2_SHADER_PERMUTATION_ENABLE_SHARPENING : 0; - flags |= (useLut) ? FSR2_SHADER_PERMUTATION_LANCZOS_LUT : 0; + flags |= (useLut) ? FSR2_SHADER_PERMUTATION_REPROJECT_USE_LANCZOS_TYPE : 0; flags |= (canForceWave64) ? FSR2_SHADER_PERMUTATION_FORCE_WAVE64 : 0; - flags |= (supportedFP16) ? FSR2_SHADER_PERMUTATION_ALLOW_FP16 : 0; + flags |= (supportedFP16 && (pass != FFX_FSR2_PASS_RCAS)) ? FSR2_SHADER_PERMUTATION_ALLOW_FP16 : 0; - Fsr2ShaderBlobVK shaderBlob = {}; - fsr2GetPermutationBlobByIndex(pass, flags, &shaderBlob); + const Fsr2ShaderBlobVK shaderBlob = fsr2GetPermutationBlobByIndex(pass, flags); FFX_ASSERT(shaderBlob.data && shaderBlob.size); // populate the pass. @@ -1292,21 +1293,22 @@ FfxErrorCode CreatePipelineVK(FfxFsr2Interface* backendInterface, FfxFsr2Pass pa FFX_ASSERT(shaderBlob.storageImageCount < FFX_MAX_NUM_UAVS); FFX_ASSERT(shaderBlob.sampledImageCount < FFX_MAX_NUM_SRVS); + std::wstring_convert> converter; for (uint32_t srvIndex = 0; srvIndex < outPipeline->srvCount; ++srvIndex) { outPipeline->srvResourceBindings[srvIndex].slotIndex = shaderBlob.boundSampledImageBindings[srvIndex]; - strcpy_s(outPipeline->srvResourceBindings[srvIndex].name, shaderBlob.boundSampledImageNames[srvIndex]); + wcscpy_s(outPipeline->srvResourceBindings[srvIndex].name, converter.from_bytes(shaderBlob.boundSampledImageNames[srvIndex]).c_str()); } for (uint32_t uavIndex = 0; uavIndex < outPipeline->uavCount; ++uavIndex) { outPipeline->uavResourceBindings[uavIndex].slotIndex = shaderBlob.boundStorageImageBindings[uavIndex]; - strcpy_s(outPipeline->uavResourceBindings[uavIndex].name, shaderBlob.boundStorageImageNames[uavIndex]); + wcscpy_s(outPipeline->uavResourceBindings[uavIndex].name, converter.from_bytes(shaderBlob.boundStorageImageNames[uavIndex]).c_str()); } for (uint32_t cbIndex = 0; cbIndex < outPipeline->constCount; ++cbIndex) { outPipeline->cbResourceBindings[cbIndex].slotIndex = shaderBlob.boundUniformBufferBindings[cbIndex]; - strcpy_s(outPipeline->cbResourceBindings[cbIndex].name, shaderBlob.boundUniformBufferNames[cbIndex]); + wcscpy_s(outPipeline->cbResourceBindings[cbIndex].name, converter.from_bytes(shaderBlob.boundUniformBufferNames[cbIndex]).c_str()); } // create descriptor set layout @@ -1429,21 +1431,21 @@ FfxErrorCode CreatePipelineVK(FfxFsr2Interface* backendInterface, FfxFsr2Pass pa return FFX_OK; } -FfxErrorCode ScheduleRenderJobVK(FfxFsr2Interface* backendInterface, const FfxRenderJobDescription* job) +FfxErrorCode ScheduleGpuJobVK(FfxFsr2Interface* backendInterface, const FfxGpuJobDescription* job) { FFX_ASSERT(NULL != backendInterface); FFX_ASSERT(NULL != job); BackendContext_VK* backendContext = (BackendContext_VK*)backendInterface->scratchBuffer; - FFX_ASSERT(backendContext->renderJobCount < FSR2_MAX_RENDERJOBS); + FFX_ASSERT(backendContext->gpuJobCount < FSR2_MAX_GPU_JOBS); - backendContext->renderJobs[backendContext->renderJobCount] = *job; + backendContext->gpuJobs[backendContext->gpuJobCount] = *job; - if (job->jobType == FFX_RENDER_JOB_COMPUTE) { + if (job->jobType == FFX_GPU_JOB_COMPUTE) { // needs to copy SRVs and UAVs in case they are on the stack only - FfxComputeJobDescription* computeJob = &backendContext->renderJobs[backendContext->renderJobCount].computeJobDescriptor; + FfxComputeJobDescription* computeJob = 
&backendContext->gpuJobs[backendContext->gpuJobCount].computeJobDescriptor; const uint32_t numConstBuffers = job->computeJobDescriptor.pipeline.constCount; for (uint32_t currentRootConstantIndex = 0; currentRootConstantIndex < numConstBuffers; ++currentRootConstantIndex) { @@ -1452,7 +1454,7 @@ FfxErrorCode ScheduleRenderJobVK(FfxFsr2Interface* backendInterface, const FfxRe } } - backendContext->renderJobCount++; + backendContext->gpuJobCount++; return FFX_OK; } @@ -1540,7 +1542,7 @@ void flushBarriers(BackendContext_VK* backendContext, VkCommandBuffer vkCommandB } } -static FfxErrorCode executeRenderJobCompute(BackendContext_VK* backendContext, FfxRenderJobDescription* job, VkCommandBuffer vkCommandBuffer) +static FfxErrorCode executeGpuJobCompute(BackendContext_VK* backendContext, FfxGpuJobDescription* job, VkCommandBuffer vkCommandBuffer) { uint32_t imageInfoIndex = 0; uint32_t bufferInfoIndex = 0; @@ -1646,7 +1648,7 @@ static FfxErrorCode executeRenderJobCompute(BackendContext_VK* backendContext, F return FFX_OK; } -static FfxErrorCode executeRenderJobCopy(BackendContext_VK* backendContext, FfxRenderJobDescription* job, VkCommandBuffer vkCommandBuffer) +static FfxErrorCode executeGpuJobCopy(BackendContext_VK* backendContext, FfxGpuJobDescription* job, VkCommandBuffer vkCommandBuffer) { BackendContext_VK::Resource ffxResourceSrc = backendContext->resources[job->copyJobDescriptor.src.internalIndex]; BackendContext_VK::Resource ffxResourceDst = backendContext->resources[job->copyJobDescriptor.dst.internalIndex]; @@ -1745,7 +1747,7 @@ static FfxErrorCode executeRenderJobCopy(BackendContext_VK* backendContext, FfxR return FFX_OK; } -static FfxErrorCode executeRenderJobClearFloat(BackendContext_VK* backendContext, FfxRenderJobDescription* job, VkCommandBuffer vkCommandBuffer) +static FfxErrorCode executeGpuJobClearFloat(BackendContext_VK* backendContext, FfxGpuJobDescription* job, VkCommandBuffer vkCommandBuffer) { uint32_t idx = job->clearJobDescriptor.target.internalIndex; BackendContext_VK::Resource ffxResource = backendContext->resources[idx]; @@ -1777,7 +1779,7 @@ static FfxErrorCode executeRenderJobClearFloat(BackendContext_VK* backendContext return FFX_OK; } -FfxErrorCode ExecuteRenderJobsVK(FfxFsr2Interface* backendInterface, FfxCommandList commandList) +FfxErrorCode ExecuteGpuJobsVK(FfxFsr2Interface* backendInterface, FfxCommandList commandList) { FFX_ASSERT(NULL != backendInterface); @@ -1786,26 +1788,26 @@ FfxErrorCode ExecuteRenderJobsVK(FfxFsr2Interface* backendInterface, FfxCommandL FfxErrorCode errorCode = FFX_OK; // execute all renderjobs - for (uint32_t i = 0; i < backendContext->renderJobCount; ++i) + for (uint32_t i = 0; i < backendContext->gpuJobCount; ++i) { - FfxRenderJobDescription* renderJob = &backendContext->renderJobs[i]; + FfxGpuJobDescription* gpuJob = &backendContext->gpuJobs[i]; VkCommandBuffer vkCommandBuffer = reinterpret_cast(commandList); - switch (renderJob->jobType) + switch (gpuJob->jobType) { - case FFX_RENDER_JOB_CLEAR_FLOAT: + case FFX_GPU_JOB_CLEAR_FLOAT: { - errorCode = executeRenderJobClearFloat(backendContext, renderJob, vkCommandBuffer); + errorCode = executeGpuJobClearFloat(backendContext, gpuJob, vkCommandBuffer); break; } - case FFX_RENDER_JOB_COPY: + case FFX_GPU_JOB_COPY: { - errorCode = executeRenderJobCopy(backendContext, renderJob, vkCommandBuffer); + errorCode = executeGpuJobCopy(backendContext, gpuJob, vkCommandBuffer); break; } - case FFX_RENDER_JOB_COMPUTE: + case FFX_GPU_JOB_COMPUTE: { - errorCode = 
executeRenderJobCompute(backendContext, renderJob, vkCommandBuffer); + errorCode = executeGpuJobCompute(backendContext, gpuJob, vkCommandBuffer); break; } default:; @@ -1817,7 +1819,7 @@ FfxErrorCode ExecuteRenderJobsVK(FfxFsr2Interface* backendInterface, FfxCommandL errorCode == FFX_OK, FFX_ERROR_BACKEND_API_ERROR); - backendContext->renderJobCount = 0; + backendContext->gpuJobCount = 0; return FFX_OK; } @@ -1911,4 +1913,4 @@ FfxErrorCode DestroyPipelineVK(FfxFsr2Interface* backendInterface, FfxPipelineSt } return FFX_OK; -} +} \ No newline at end of file diff --git a/src/ffx-fsr2-api/vk/ffx_fsr2_vk.h b/src/ffx-fsr2-api/vk/ffx_fsr2_vk.h index c352c98..e0e226a 100644 --- a/src/ffx-fsr2-api/vk/ffx_fsr2_vk.h +++ b/src/ffx-fsr2-api/vk/ffx_fsr2_vk.h @@ -19,7 +19,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. - // @defgroup VK #pragma once @@ -156,4 +155,4 @@ extern "C" { #if defined(__cplusplus) } -#endif // #if defined(__cplusplus) +#endif // #if defined(__cplusplus) \ No newline at end of file diff --git a/src/ffx-fsr2-api/vk/shaders/ffx_fsr2_shaders_vk.cpp b/src/ffx-fsr2-api/vk/shaders/ffx_fsr2_shaders_vk.cpp index 3ecdea1..230ae9b 100644 --- a/src/ffx-fsr2-api/vk/shaders/ffx_fsr2_shaders_vk.cpp +++ b/src/ffx-fsr2-api/vk/shaders/ffx_fsr2_shaders_vk.cpp @@ -29,14 +29,13 @@ #include "ffx_fsr2_prepare_input_color_pass_permutations.h" #include "ffx_fsr2_reconstruct_previous_depth_pass_permutations.h" #include "ffx_fsr2_rcas_pass_permutations.h" -#include #if defined(POPULATE_PERMUTATION_KEY) #undef POPULATE_PERMUTATION_KEY #endif // #if defined(POPULATE_PERMUTATION_KEY) #define POPULATE_PERMUTATION_KEY(options, key) \ key.index = 0; \ -key.FFX_FSR2_OPTION_USE_LANCZOS_LUT = FFX_CONTAINS_FLAG(options, FSR2_SHADER_PERMUTATION_LANCZOS_LUT); \ +key.FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE = FFX_CONTAINS_FLAG(options, FSR2_SHADER_PERMUTATION_REPROJECT_USE_LANCZOS_TYPE); \ key.FFX_FSR2_OPTION_HDR_COLOR_INPUT = FFX_CONTAINS_FLAG(options, FSR2_SHADER_PERMUTATION_HDR_COLOR_INPUT); \ key.FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS = FFX_CONTAINS_FLAG(options, FSR2_SHADER_PERMUTATION_LOW_RES_MOTION_VECTORS); \ key.FFX_FSR2_OPTION_JITTERED_MOTION_VECTORS = FFX_CONTAINS_FLAG(options, FSR2_SHADER_PERMUTATION_JITTER_MOTION_VECTORS); \ @@ -114,7 +113,7 @@ Fsr2ShaderBlobVK fsr2GetComputeLuminancePyramidPassPermutationBlobByIndex(uint32 ffx_fsr2_compute_luminance_pyramid_pass_PermutationKey key; key.index = 0; - key.FFX_FSR2_OPTION_USE_LANCZOS_LUT = FFX_CONTAINS_FLAG(permutationOptions, FSR2_SHADER_PERMUTATION_LANCZOS_LUT); + key.FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE = FFX_CONTAINS_FLAG(permutationOptions, FSR2_SHADER_PERMUTATION_REPROJECT_USE_LANCZOS_TYPE); key.FFX_FSR2_OPTION_HDR_COLOR_INPUT = FFX_CONTAINS_FLAG(permutationOptions, FSR2_SHADER_PERMUTATION_HDR_COLOR_INPUT); key.FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS = FFX_CONTAINS_FLAG(permutationOptions, FSR2_SHADER_PERMUTATION_LOW_RES_MOTION_VECTORS); key.FFX_FSR2_OPTION_JITTERED_MOTION_VECTORS = FFX_CONTAINS_FLAG(permutationOptions, FSR2_SHADER_PERMUTATION_JITTER_MOTION_VECTORS); @@ -135,73 +134,33 @@ Fsr2ShaderBlobVK fsr2GetAutogenReactivePassPermutationBlobByIndex(uint32_t permu return POPULATE_SHADER_BLOB(g_ffx_fsr2_autogen_reactive_pass_PermutationInfo, tableIndex); } -FfxErrorCode fsr2GetPermutationBlobByIndex(FfxFsr2Pass passId, uint32_t permutationOptions, Fsr2ShaderBlobVK* outBlob) +Fsr2ShaderBlobVK fsr2GetPermutationBlobByIndex(FfxFsr2Pass passId, uint32_t permutationOptions) { switch 
(passId) { case FFX_FSR2_PASS_PREPARE_INPUT_COLOR: - { - Fsr2ShaderBlobVK blob = fsr2GetPrepareInputColorPassPermutationBlobByIndex(permutationOptions); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobVK)); - return FFX_OK; - } - + return fsr2GetPrepareInputColorPassPermutationBlobByIndex(permutationOptions); case FFX_FSR2_PASS_DEPTH_CLIP: - { - Fsr2ShaderBlobVK blob = fsr2GetDepthClipPassPermutationBlobByIndex(permutationOptions); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobVK)); - return FFX_OK; - } - + return fsr2GetDepthClipPassPermutationBlobByIndex(permutationOptions); case FFX_FSR2_PASS_RECONSTRUCT_PREVIOUS_DEPTH: - { - Fsr2ShaderBlobVK blob = fsr2GetReconstructPreviousDepthPassPermutationBlobByIndex(permutationOptions); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobVK)); - return FFX_OK; - } - + return fsr2GetReconstructPreviousDepthPassPermutationBlobByIndex(permutationOptions); case FFX_FSR2_PASS_LOCK: - { - Fsr2ShaderBlobVK blob = fsr2GetLockPassPermutationBlobByIndex(permutationOptions); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobVK)); - return FFX_OK; - } - + return fsr2GetLockPassPermutationBlobByIndex(permutationOptions); case FFX_FSR2_PASS_ACCUMULATE: case FFX_FSR2_PASS_ACCUMULATE_SHARPEN: - { - Fsr2ShaderBlobVK blob = fsr2GetAccumulatePassPermutationBlobByIndex(permutationOptions); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobVK)); - return FFX_OK; - } - + return fsr2GetAccumulatePassPermutationBlobByIndex(permutationOptions); case FFX_FSR2_PASS_RCAS: - { - Fsr2ShaderBlobVK blob = fsr2GetRCASPassPermutationBlobByIndex(permutationOptions); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobVK)); - return FFX_OK; - } - + return fsr2GetRCASPassPermutationBlobByIndex(permutationOptions); case FFX_FSR2_PASS_COMPUTE_LUMINANCE_PYRAMID: - { - Fsr2ShaderBlobVK blob = fsr2GetComputeLuminancePyramidPassPermutationBlobByIndex(permutationOptions); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobVK)); - return FFX_OK; - } - + return fsr2GetComputeLuminancePyramidPassPermutationBlobByIndex(permutationOptions); case FFX_FSR2_PASS_GENERATE_REACTIVE: - { - Fsr2ShaderBlobVK blob = fsr2GetAutogenReactivePassPermutationBlobByIndex(permutationOptions); - memcpy(outBlob, &blob, sizeof(Fsr2ShaderBlobVK)); - return FFX_OK; - } - + return fsr2GetAutogenReactivePassPermutationBlobByIndex(permutationOptions); default: FFX_ASSERT_FAIL("Should never reach here."); break; } // return an empty blob - memset(outBlob, 0, sizeof(Fsr2ShaderBlobVK)); - return FFX_OK; -} + Fsr2ShaderBlobVK emptyBlob = {}; + return emptyBlob; +} \ No newline at end of file diff --git a/src/ffx-fsr2-api/vk/shaders/ffx_fsr2_shaders_vk.h b/src/ffx-fsr2-api/vk/shaders/ffx_fsr2_shaders_vk.h index 9cadc31..da581c7 100644 --- a/src/ffx-fsr2-api/vk/shaders/ffx_fsr2_shaders_vk.h +++ b/src/ffx-fsr2-api/vk/shaders/ffx_fsr2_shaders_vk.h @@ -28,37 +28,37 @@ extern "C" { #endif // #if defined(__cplusplus) -// A single shader blob and a description of its resources. -typedef struct Fsr2ShaderBlobVK { + // A single shader blob and a description of its resources. + typedef struct Fsr2ShaderBlobVK { - const uint8_t* data; // A pointer to the blob - const uint32_t size; // Size in bytes. - const uint32_t storageImageCount; // Number of storage images. - const uint32_t sampledImageCount; // Number of sampled images. - const uint32_t uniformBufferCount; // Number of uniform buffers. - const char** boundStorageImageNames; - const uint32_t* boundStorageImageBindings; // Pointer to an array of bound UAV resources. 
- const char** boundSampledImageNames; - const uint32_t* boundSampledImageBindings; // Pointer to an array of bound SRV resources. - const char** boundUniformBufferNames; - const uint32_t* boundUniformBufferBindings; // Pointer to an array of bound ConstantBuffers. -} Fsr2ShaderBlobVK; + const uint8_t* data; // A pointer to the blob + const uint32_t size; // Size in bytes. + const uint32_t storageImageCount; // Number of storage images. + const uint32_t sampledImageCount; // Number of sampled images. + const uint32_t uniformBufferCount; // Number of uniform buffers. + const char** boundStorageImageNames; + const uint32_t* boundStorageImageBindings; // Pointer to an array of bound UAV resources. + const char** boundSampledImageNames; + const uint32_t* boundSampledImageBindings; // Pointer to an array of bound SRV resources. + const char** boundUniformBufferNames; + const uint32_t* boundUniformBufferBindings; // Pointer to an array of bound ConstantBuffers. + } Fsr2ShaderBlobVK; -// The different options which contribute to permutations. -typedef enum Fs2ShaderPermutationOptionsVK { + // The different options which contribute to permutations. + typedef enum Fs2ShaderPermutationOptionsVK { - FSR2_SHADER_PERMUTATION_LANCZOS_LUT = (1 << 0), // FFX_FSR2_OPTION_USE_LANCZOS_LUT - FSR2_SHADER_PERMUTATION_HDR_COLOR_INPUT = (1 << 1), // FFX_FSR2_OPTION_HDR_COLOR_INPUT - FSR2_SHADER_PERMUTATION_LOW_RES_MOTION_VECTORS = (1 << 2), // FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS - FSR2_SHADER_PERMUTATION_JITTER_MOTION_VECTORS = (1 << 3), // FFX_FSR2_OPTION_JITTERED_MOTION_VECTORS - FSR2_SHADER_PERMUTATION_DEPTH_INVERTED = (1 << 4), // FFX_FSR2_OPTION_INVERTED_DEPTH - FSR2_SHADER_PERMUTATION_ENABLE_SHARPENING = (1 << 5), // FFX_FSR2_OPTION_APPLY_SHARPENING - FSR2_SHADER_PERMUTATION_FORCE_WAVE64 = (1 << 6), // doesn't map to a define, selects different table - FSR2_SHADER_PERMUTATION_ALLOW_FP16 = (1 << 7), // FFX_USE_16BIT -} Fs2ShaderPermutationOptionsVK; + FSR2_SHADER_PERMUTATION_REPROJECT_USE_LANCZOS_TYPE = (1 << 0), // FFX_FSR2_OPTION_REPROJECT_USE_LANCZOS_TYPE + FSR2_SHADER_PERMUTATION_HDR_COLOR_INPUT = (1 << 1), // FFX_FSR2_OPTION_HDR_COLOR_INPUT + FSR2_SHADER_PERMUTATION_LOW_RES_MOTION_VECTORS = (1 << 2), // FFX_FSR2_OPTION_LOW_RESOLUTION_MOTION_VECTORS + FSR2_SHADER_PERMUTATION_JITTER_MOTION_VECTORS = (1 << 3), // FFX_FSR2_OPTION_JITTERED_MOTION_VECTORS + FSR2_SHADER_PERMUTATION_DEPTH_INVERTED = (1 << 4), // FFX_FSR2_OPTION_INVERTED_DEPTH + FSR2_SHADER_PERMUTATION_ENABLE_SHARPENING = (1 << 5), // FFX_FSR2_OPTION_APPLY_SHARPENING + FSR2_SHADER_PERMUTATION_FORCE_WAVE64 = (1 << 6), // doesn't map to a define, selects different table + FSR2_SHADER_PERMUTATION_ALLOW_FP16 = (1 << 7), // FFX_USE_16BIT + } Fs2ShaderPermutationOptionsVK; -// Get a VK shader blob for the specified pass and permutation index. -FfxErrorCode fsr2GetPermutationBlobByIndex(FfxFsr2Pass passId, uint32_t permutationOptions, Fsr2ShaderBlobVK* outBlob); + // Get a VK shader blob for the specified pass and permutation index. + Fsr2ShaderBlobVK fsr2GetPermutationBlobByIndex(FfxFsr2Pass passId, uint32_t permutationOptions); #if defined(__cplusplus) } diff --git a/src/ffx-parallelsort/FFX_ParallelSort.h b/src/ffx-parallelsort/FFX_ParallelSort.h new file mode 100644 index 0000000..fc1ce06 --- /dev/null +++ b/src/ffx-parallelsort/FFX_ParallelSort.h @@ -0,0 +1,514 @@ +// FFX_ParallelSort.h +// +// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. 
+// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#define FFX_PARALLELSORT_SORT_BITS_PER_PASS 4 +#define FFX_PARALLELSORT_SORT_BIN_COUNT (1 << FFX_PARALLELSORT_SORT_BITS_PER_PASS) +#define FFX_PARALLELSORT_ELEMENTS_PER_THREAD 4 +#define FFX_PARALLELSORT_THREADGROUP_SIZE 128 + +////////////////////////////////////////////////////////////////////////// +// ParallelSort constant buffer parameters: +// +// NumKeys The number of keys to sort +// Shift How many bits to shift for this sort pass (we sort 4 bits at a time) +// NumBlocksPerThreadGroup How many blocks of keys each thread group needs to process +// NumThreadGroups How many thread groups are being run concurrently for sort +// NumThreadGroupsWithAdditionalBlocks How many thread groups need to process additional block data +// NumReduceThreadgroupPerBin How many thread groups are summed together for each reduced bin entry +// NumScanValues How many values to perform scan prefix (+ add) on +////////////////////////////////////////////////////////////////////////// + +#ifdef FFX_CPP + struct FFX_ParallelSortCB + { + uint32_t NumKeys; + int32_t NumBlocksPerThreadGroup; + uint32_t NumThreadGroups; + uint32_t NumThreadGroupsWithAdditionalBlocks; + uint32_t NumReduceThreadgroupPerBin; + uint32_t NumScanValues; + }; + + void FFX_ParallelSort_CalculateScratchResourceSize(uint32_t MaxNumKeys, uint32_t& ScratchBufferSize, uint32_t& ReduceScratchBufferSize) + { + uint32_t BlockSize = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; + uint32_t NumBlocks = (MaxNumKeys + BlockSize - 1) / BlockSize; + uint32_t NumReducedBlocks = (NumBlocks + BlockSize - 1) / BlockSize; + + ScratchBufferSize = FFX_PARALLELSORT_SORT_BIN_COUNT * NumBlocks * sizeof(uint32_t); + ReduceScratchBufferSize = FFX_PARALLELSORT_SORT_BIN_COUNT * NumReducedBlocks * sizeof(uint32_t); + } + + void FFX_ParallelSort_SetConstantAndDispatchData(uint32_t NumKeys, uint32_t MaxThreadGroups, FFX_ParallelSortCB& ConstantBuffer, uint32_t& NumThreadGroupsToRun, uint32_t& NumReducedThreadGroupsToRun) + { + ConstantBuffer.NumKeys = NumKeys; + + uint32_t BlockSize = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; + uint32_t NumBlocks = (NumKeys + BlockSize - 1) / BlockSize; + + // Figure out data distribution + NumThreadGroupsToRun = MaxThreadGroups; + uint32_t BlocksPerThreadGroup = (NumBlocks / NumThreadGroupsToRun); + ConstantBuffer.NumThreadGroupsWithAdditionalBlocks = NumBlocks % NumThreadGroupsToRun; + + if (NumBlocks < 
NumThreadGroupsToRun) + { + BlocksPerThreadGroup = 1; + NumThreadGroupsToRun = NumBlocks; + ConstantBuffer.NumThreadGroupsWithAdditionalBlocks = 0; + } + + ConstantBuffer.NumThreadGroups = NumThreadGroupsToRun; + ConstantBuffer.NumBlocksPerThreadGroup = BlocksPerThreadGroup; + + // Calculate the number of thread groups to run for reduction (each thread group can process BlockSize number of entries) + NumReducedThreadGroupsToRun = FFX_PARALLELSORT_SORT_BIN_COUNT * ((BlockSize > NumThreadGroupsToRun) ? 1 : (NumThreadGroupsToRun + BlockSize - 1) / BlockSize); + ConstantBuffer.NumReduceThreadgroupPerBin = NumReducedThreadGroupsToRun / FFX_PARALLELSORT_SORT_BIN_COUNT; + ConstantBuffer.NumScanValues = NumReducedThreadGroupsToRun; // The number of reduce thread groups becomes our scan count (as each thread group writes out 1 value that needs scan prefix) + } + + // We are using some optimizations to hide buffer load latency, so make sure anyone changing this define is made aware of that fact. + static_assert(FFX_PARALLELSORT_ELEMENTS_PER_THREAD == 4, "FFX_ParallelSort Shaders currently explicitly rely on FFX_PARALLELSORT_ELEMENTS_PER_THREAD being set to 4 in order to optimize buffer loads. Please adjust the optimization to factor in the new define value."); +#elif defined(FFX_HLSL) + + struct FFX_ParallelSortCB + { + uint NumKeys; + int NumBlocksPerThreadGroup; + uint NumThreadGroups; + uint NumThreadGroupsWithAdditionalBlocks; + uint NumReduceThreadgroupPerBin; + uint NumScanValues; + }; + + groupshared uint gs_Histogram[FFX_PARALLELSORT_THREADGROUP_SIZE * FFX_PARALLELSORT_SORT_BIN_COUNT]; + void FFX_ParallelSort_Count_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer SrcBuffer, RWStructuredBuffer SumTable) + { + int i; + // Start by clearing our local counts in LDS + for ( i = 0; i < FFX_PARALLELSORT_SORT_BIN_COUNT; i++) + gs_Histogram[(i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID] = 0; + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + + // Data is processed in blocks, and how many we process can changed based on how much data we are processing + // versus how many thread groups we are processing with + int BlockSize = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; + + // Figure out this thread group's index into the block data (taking into account thread groups that need to do extra reads) + uint ThreadgroupBlockStart = (BlockSize * CBuffer.NumBlocksPerThreadGroup * groupID); + uint NumBlocksToProcess = CBuffer.NumBlocksPerThreadGroup; + + if (groupID >= CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks) + { + ThreadgroupBlockStart += (groupID - (CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks)) * BlockSize; + NumBlocksToProcess++; + } + + // Get the block start index for this thread + uint BlockIndex = ThreadgroupBlockStart + localID; + + // Count value occurrence + for (uint BlockCount = 0; BlockCount < NumBlocksToProcess; BlockCount++, BlockIndex += BlockSize) + { + uint DataIndex = BlockIndex; + + // Pre-load the key values in order to hide some of the read latency + uint srcKeys[FFX_PARALLELSORT_ELEMENTS_PER_THREAD]; + srcKeys[0] = SrcBuffer[DataIndex]; + srcKeys[1] = SrcBuffer[DataIndex + FFX_PARALLELSORT_THREADGROUP_SIZE]; + srcKeys[2] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 2)]; + srcKeys[3] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 3)]; + + for ( i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) + { 
+ if (DataIndex < CBuffer.NumKeys) + { + uint localKey = (srcKeys[i] >> ShiftBit) & 0xf; + InterlockedAdd(gs_Histogram[(localKey * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID], 1); + DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE; + } + } + } + + // Even though our LDS layout guarantees no collisions, our thread group size is greater than a wave + // so we need to make sure all thread groups are done counting before we start tallying up the results + GroupMemoryBarrierWithGroupSync(); + + if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) + { + uint sum = 0; + for (i = 0; i < FFX_PARALLELSORT_THREADGROUP_SIZE; i++) + { + sum += gs_Histogram[localID * FFX_PARALLELSORT_THREADGROUP_SIZE + i]; + } + SumTable[localID * CBuffer.NumThreadGroups + groupID] = sum; + } + } + + groupshared uint gs_LDSSums[FFX_PARALLELSORT_THREADGROUP_SIZE]; + uint FFX_ParallelSort_ThreadgroupReduce(uint localSum, uint localID) + { + // Do wave local reduce + uint waveReduced = WaveActiveSum(localSum); + + // First lane in a wave writes out wave reduction to LDS (this accounts for num waves per group greater than HW wave size) + // Note that some hardware with very small HW wave sizes (i.e. <= 8) may exhibit issues with this algorithm, and have not been tested. + uint waveID = localID / WaveGetLaneCount(); + if (WaveIsFirstLane()) + gs_LDSSums[waveID] = waveReduced; + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + + // First wave worth of threads sum up wave reductions + if (!waveID) + waveReduced = WaveActiveSum( (localID < FFX_PARALLELSORT_THREADGROUP_SIZE / WaveGetLaneCount()) ? gs_LDSSums[localID] : 0); + + // Returned the reduced sum + return waveReduced; + } + + uint FFX_ParallelSort_BlockScanPrefix(uint localSum, uint localID) + { + // Do wave local scan-prefix + uint wavePrefixed = WavePrefixSum(localSum); + + // Since we are dealing with thread group sizes greater than HW wave size, we need to account for what wave we are in. + uint waveID = localID / WaveGetLaneCount(); + uint laneID = WaveGetLaneIndex(); + + // Last element in a wave writes out partial sum to LDS + if (laneID == WaveGetLaneCount() - 1) + gs_LDSSums[waveID] = wavePrefixed + localSum; + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + + // First wave prefixes partial sums + if (!waveID) + gs_LDSSums[localID] = WavePrefixSum(gs_LDSSums[localID]); + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + + // Add the partial sums back to each wave prefix + wavePrefixed += gs_LDSSums[waveID]; + + return wavePrefixed; + } + + void FFX_ParallelSort_ReduceCount(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, RWStructuredBuffer SumTable, RWStructuredBuffer ReduceTable) + { + // Figure out what bin data we are reducing + uint BinID = groupID / CBuffer.NumReduceThreadgroupPerBin; + uint BinOffset = BinID * CBuffer.NumThreadGroups; + + // Get the base index for this thread group + uint BaseIndex = (groupID % CBuffer.NumReduceThreadgroupPerBin) * FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; + + // Calculate partial sums for entries this thread reads in + uint threadgroupSum = 0; + for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; ++i) + { + uint DataIndex = BaseIndex + (i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID; + threadgroupSum += (DataIndex < CBuffer.NumThreadGroups) ? 
SumTable[BinOffset + DataIndex] : 0; + } + + // Reduce across the entirety of the thread group + threadgroupSum = FFX_ParallelSort_ThreadgroupReduce(threadgroupSum, localID); + + // First thread of the group writes out the reduced sum for the bin + if (!localID) + ReduceTable[groupID] = threadgroupSum; + + // What this will look like in the reduced table is: + // [ [bin0 ... bin0] [bin1 ... bin1] ... ] + } + + // This is to transform uncoalesced loads into coalesced loads and + // then scattered loads from LDS + groupshared int gs_LDS[FFX_PARALLELSORT_ELEMENTS_PER_THREAD][FFX_PARALLELSORT_THREADGROUP_SIZE]; + void FFX_ParallelSort_ScanPrefix(uint numValuesToScan, uint localID, uint groupID, uint BinOffset, uint BaseIndex, bool AddPartialSums, + FFX_ParallelSortCB CBuffer, RWStructuredBuffer ScanSrc, RWStructuredBuffer ScanDst, RWStructuredBuffer ScanScratch) + { + uint i; + // Perform coalesced loads into LDS + for ( i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) + { + uint DataIndex = BaseIndex + (i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID; + + uint col = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) / FFX_PARALLELSORT_ELEMENTS_PER_THREAD; + uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD; + gs_LDS[row][col] = (DataIndex < numValuesToScan) ? ScanSrc[BinOffset + DataIndex] : 0; + } + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + + uint threadgroupSum = 0; + // Calculate the local scan-prefix for current thread + for ( i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) + { + uint tmp = gs_LDS[i][localID]; + gs_LDS[i][localID] = threadgroupSum; + threadgroupSum += tmp; + } + + // Scan prefix partial sums + threadgroupSum = FFX_ParallelSort_BlockScanPrefix(threadgroupSum, localID); + + // Add reduced partial sums if requested + uint partialSum = 0; + if (AddPartialSums) + { + // Partial sum additions are a little special as they are tailored to the optimal number of + // thread groups we ran in the beginning, so need to take that into account + partialSum = ScanScratch[groupID]; + } + + // Add the block scanned-prefixes back in + for (i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) + gs_LDS[i][localID] += threadgroupSum; + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + + // Perform coalesced writes to scan dst + for ( i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) + { + uint DataIndex = BaseIndex + (i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID; + + uint col = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) / FFX_PARALLELSORT_ELEMENTS_PER_THREAD; + uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD; + + if (DataIndex < numValuesToScan) + ScanDst[BinOffset + DataIndex] = gs_LDS[row][col] + partialSum; + } + } + + // Offset cache to avoid loading the offsets all the time + groupshared uint gs_BinOffsetCache[FFX_PARALLELSORT_THREADGROUP_SIZE]; + // Local histogram for offset calculations + groupshared uint gs_LocalHistogram[FFX_PARALLELSORT_SORT_BIN_COUNT]; + // Scratch area for algorithm + groupshared uint gs_LDSScratch[FFX_PARALLELSORT_THREADGROUP_SIZE]; + void FFX_ParallelSort_Scatter_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer SrcBuffer, RWStructuredBuffer DstBuffer, RWStructuredBuffer SumTable +#ifdef kRS_ValueCopy + ,RWStructuredBuffer SrcPayload, RWStructuredBuffer DstPayload +#endif // kRS_ValueCopy + ) + { + // Load the sort bin 
threadgroup offsets into LDS for faster referencing + if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) + gs_BinOffsetCache[localID] = SumTable[localID * CBuffer.NumThreadGroups + groupID]; + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + + // Data is processed in blocks, and how many we process can changed based on how much data we are processing + // versus how many thread groups we are processing with + int BlockSize = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; + + // Figure out this thread group's index into the block data (taking into account thread groups that need to do extra reads) + uint ThreadgroupBlockStart = (BlockSize * CBuffer.NumBlocksPerThreadGroup * groupID); + uint NumBlocksToProcess = CBuffer.NumBlocksPerThreadGroup; + + if (groupID >= CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks) + { + ThreadgroupBlockStart += (groupID - (CBuffer.NumThreadGroups - CBuffer.NumThreadGroupsWithAdditionalBlocks)) * BlockSize; + NumBlocksToProcess++; + } + + // Get the block start index for this thread + uint BlockIndex = ThreadgroupBlockStart + localID; + + // Count value occurences + uint newCount; + for (int BlockCount = 0; BlockCount < NumBlocksToProcess; BlockCount++, BlockIndex += BlockSize) + { + uint DataIndex = BlockIndex; + + // Pre-load the key values in order to hide some of the read latency + uint srcKeys[FFX_PARALLELSORT_ELEMENTS_PER_THREAD]; + srcKeys[0] = SrcBuffer[DataIndex]; + srcKeys[1] = SrcBuffer[DataIndex + FFX_PARALLELSORT_THREADGROUP_SIZE]; + srcKeys[2] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 2)]; + srcKeys[3] = SrcBuffer[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 3)]; + +#ifdef kRS_ValueCopy + uint srcValues[FFX_PARALLELSORT_ELEMENTS_PER_THREAD]; + srcValues[0] = SrcPayload[DataIndex]; + srcValues[1] = SrcPayload[DataIndex + FFX_PARALLELSORT_THREADGROUP_SIZE]; + srcValues[2] = SrcPayload[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 2)]; + srcValues[3] = SrcPayload[DataIndex + (FFX_PARALLELSORT_THREADGROUP_SIZE * 3)]; +#endif // kRS_ValueCopy + + for (int i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) + { + // Clear the local histogram + if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) + gs_LocalHistogram[localID] = 0; + + uint localKey = (DataIndex < CBuffer.NumKeys ? srcKeys[i] : 0xffffffff); +#ifdef kRS_ValueCopy + uint localValue = (DataIndex < CBuffer.NumKeys ? 
srcValues[i] : 0); +#endif // kRS_ValueCopy + + // Sort the keys locally in LDS + for (uint bitShift = 0; bitShift < FFX_PARALLELSORT_SORT_BITS_PER_PASS; bitShift += 2) + { + // Figure out the keyIndex + uint keyIndex = (localKey >> ShiftBit) & 0xf; + uint bitKey = (keyIndex >> bitShift) & 0x3; + + // Create a packed histogram + uint packedHistogram = 1 << (bitKey * 8); + + // Sum up all the packed keys (generates counted offsets up to current thread group) + uint localSum = FFX_ParallelSort_BlockScanPrefix(packedHistogram, localID); + + // Last thread stores the updated histogram counts for the thread group + // Scratch = 0xsum3|sum2|sum1|sum0 for thread group + if (localID == (FFX_PARALLELSORT_THREADGROUP_SIZE - 1)) + gs_LDSScratch[0] = localSum + packedHistogram; + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + + // Load the sums value for the thread group + packedHistogram = gs_LDSScratch[0]; + + // Add prefix offsets for all 4 bit "keys" (packedHistogram = 0xsum2_1_0|sum1_0|sum0|0) + packedHistogram = (packedHistogram << 8) + (packedHistogram << 16) + (packedHistogram << 24); + + // Calculate the proper offset for this thread's value + localSum += packedHistogram; + + // Calculate target offset + uint keyOffset = (localSum >> (bitKey * 8)) & 0xff; + + // Re-arrange the keys (store, sync, load) + gs_LDSSums[keyOffset] = localKey; + GroupMemoryBarrierWithGroupSync(); + localKey = gs_LDSSums[localID]; + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + +#ifdef kRS_ValueCopy + // Re-arrange the values if we have them (store, sync, load) + gs_LDSSums[keyOffset] = localValue; + GroupMemoryBarrierWithGroupSync(); + localValue = gs_LDSSums[localID]; + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); +#endif // kRS_ValueCopy + } + + // Need to recalculate the keyIndex on this thread now that values have been copied around the thread group + uint keyIndex = (localKey >> ShiftBit) & 0xf; + + // Reconstruct histogram + InterlockedAdd(gs_LocalHistogram[keyIndex], 1); + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + + // Prefix histogram + uint histogramPrefixSum = WavePrefixSum(localID < FFX_PARALLELSORT_SORT_BIN_COUNT ? 
gs_LocalHistogram[localID] : 0); + + // Broadcast prefix-sum via LDS + if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) + gs_LDSScratch[localID] = histogramPrefixSum; + + // Get the global offset for this key out of the cache + uint globalOffset = gs_BinOffsetCache[keyIndex]; + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + + // Get the local offset (at this point the keys are all in increasing order from 0 -> num bins in localID 0 -> thread group size) + uint localOffset = localID - gs_LDSScratch[keyIndex]; + + // Write to destination + uint totalOffset = globalOffset + localOffset; + + if (totalOffset < CBuffer.NumKeys) + { + DstBuffer[totalOffset] = localKey; + +#ifdef kRS_ValueCopy + DstPayload[totalOffset] = localValue; +#endif // kRS_ValueCopy + } + + // Wait for everyone to catch up + GroupMemoryBarrierWithGroupSync(); + + // Update the cached histogram for the next set of entries + if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) + gs_BinOffsetCache[localID] += gs_LocalHistogram[localID]; + + DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE; // Increase the data offset by thread group size + } + } + } + + void FFX_ParallelSort_SetupIndirectParams(uint NumKeys, uint MaxThreadGroups, RWStructuredBuffer CBuffer, RWStructuredBuffer CountScatterArgs, RWStructuredBuffer ReduceScanArgs) + { + CBuffer[0].NumKeys = NumKeys; + + uint BlockSize = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; + uint NumBlocks = (NumKeys + BlockSize - 1) / BlockSize; + + // Figure out data distribution + uint NumThreadGroupsToRun = MaxThreadGroups; + uint BlocksPerThreadGroup = (NumBlocks / NumThreadGroupsToRun); + CBuffer[0].NumThreadGroupsWithAdditionalBlocks = NumBlocks % NumThreadGroupsToRun; + + if (NumBlocks < NumThreadGroupsToRun) + { + BlocksPerThreadGroup = 1; + NumThreadGroupsToRun = NumBlocks; + CBuffer[0].NumThreadGroupsWithAdditionalBlocks = 0; + } + + CBuffer[0].NumThreadGroups = NumThreadGroupsToRun; + CBuffer[0].NumBlocksPerThreadGroup = BlocksPerThreadGroup; + + // Calculate the number of thread groups to run for reduction (each thread group can process BlockSize number of entries) + uint NumReducedThreadGroupsToRun = FFX_PARALLELSORT_SORT_BIN_COUNT * ((BlockSize > NumThreadGroupsToRun) ? 1 : (NumThreadGroupsToRun + BlockSize - 1) / BlockSize); + CBuffer[0].NumReduceThreadgroupPerBin = NumReducedThreadGroupsToRun / FFX_PARALLELSORT_SORT_BIN_COUNT; + CBuffer[0].NumScanValues = NumReducedThreadGroupsToRun; // The number of reduce thread groups becomes our scan count (as each thread group writes out 1 value that needs scan prefix) + + // Setup dispatch arguments + CountScatterArgs[0] = NumThreadGroupsToRun; + CountScatterArgs[1] = 1; + CountScatterArgs[2] = 1; + + ReduceScanArgs[0] = NumReducedThreadGroupsToRun; + ReduceScanArgs[1] = 1; + ReduceScanArgs[2] = 1; + } + +#endif // __cplusplus +
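
For illustration only (not part of the patch): a minimal host-side sketch of how the CPU helpers added in FFX_ParallelSort.h above are meant to be driven. Only FFX_ParallelSort_CalculateScratchResourceSize, FFX_ParallelSort_SetConstantAndDispatchData, FFX_ParallelSortCB and the FFX_PARALLELSORT_* defines come from the header; SortSetupExample and the buffer/dispatch comments are hypothetical engine-side placeholders.

// Minimal sketch, assuming the header is available as "FFX_ParallelSort.h" and that
// defining FFX_CPP selects its C++ section. Resource creation and dispatch calls are
// engine-specific and only hinted at in comments.
#include <cstdint>
#define FFX_CPP
#include "FFX_ParallelSort.h"

void SortSetupExample(uint32_t maxNumKeys, uint32_t numKeys, uint32_t maxThreadGroups)
{
    // Size the two scratch buffers once, for the largest key count we will ever sort.
    uint32_t scratchBufferSize = 0;
    uint32_t reduceScratchBufferSize = 0;
    FFX_ParallelSort_CalculateScratchResourceSize(maxNumKeys, scratchBufferSize, reduceScratchBufferSize);
    // ... create the SumTable (scratchBufferSize bytes) and ReduceTable
    //     (reduceScratchBufferSize bytes) structured buffers here.

    // Per sort: fill the constant buffer and work out the dispatch dimensions.
    FFX_ParallelSortCB constantBuffer = {};
    uint32_t numThreadGroupsToRun = 0;
    uint32_t numReducedThreadGroupsToRun = 0;
    FFX_ParallelSort_SetConstantAndDispatchData(numKeys, maxThreadGroups, constantBuffer, numThreadGroupsToRun, numReducedThreadGroupsToRun);

    // One count / reduce / scan / scatter round per 4-bit digit of the 32-bit keys.
    for (uint32_t shiftBit = 0; shiftBit < 32u; shiftBit += FFX_PARALLELSORT_SORT_BITS_PER_PASS)
    {
        // ... upload constantBuffer and shiftBit, dispatch the count and scatter passes
        //     with numThreadGroupsToRun groups and the reduce/scan passes with
        //     numReducedThreadGroupsToRun groups, then ping-pong the key (and optional
        //     payload) buffers before the next digit.
    }
}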
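
A second sketch under the same caveat, summarising the reworked Vulkan shader-blob API in this patch: fsr2GetPermutationBlobByIndex now returns a Fsr2ShaderBlobVK by value instead of filling an out-parameter, and CreatePipelineVK excludes FP16 for the RCAS pass. The helper name selectBlob, its boolean parameters, and the include path are hypothetical stand-ins for the backend's capability queries, and only a subset of the permutation flags is shown.

// Minimal sketch, assuming the header is reachable as "shaders/ffx_fsr2_shaders_vk.h"
// (as laid out in this patch) and that FfxFsr2Pass is visible through its includes.
#include <cstdint>
#include "shaders/ffx_fsr2_shaders_vk.h"

static Fsr2ShaderBlobVK selectBlob(FfxFsr2Pass pass, bool useLut, bool supportedFP16, bool canForceWave64)
{
    uint32_t flags = 0;
    flags |= useLut ? FSR2_SHADER_PERMUTATION_REPROJECT_USE_LANCZOS_TYPE : 0; // renamed from FSR2_SHADER_PERMUTATION_LANCZOS_LUT
    flags |= canForceWave64 ? FSR2_SHADER_PERMUTATION_FORCE_WAVE64 : 0;
    flags |= (pass == FFX_FSR2_PASS_ACCUMULATE_SHARPEN) ? FSR2_SHADER_PERMUTATION_ENABLE_SHARPENING : 0;
    // FP16 is now explicitly disallowed for the RCAS pass.
    flags |= (supportedFP16 && (pass != FFX_FSR2_PASS_RCAS)) ? FSR2_SHADER_PERMUTATION_ALLOW_FP16 : 0;

    // Returned by value; blob.data / blob.size feed shader-module creation, and the
    // bound*Names / bound*Bindings arrays describe the resource bindings for the pass.
    const Fsr2ShaderBlobVK blob = fsr2GetPermutationBlobByIndex(pass, flags);
    return blob;
}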