assets/shaders/post/hiz.comp

#version 450
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_quad : require

// A rewrite of SPD to support HiZ correctly and moar wave ops for good measure.

// One-dimensional workgroup of 256 invocations. main() remaps the linear index to a 16x16 coordinate.
layout(local_size_x = 256) in;

#if defined(WRITE_TOP_LEVEL) && WRITE_TOP_LEVEL
// Optional write-only target for the transformed top level, written through write_image_top().
// Only declared when WRITE_TOP_LEVEL is defined to a non-zero value.
layout(set = 0, binding = 0, r32f) coherent writeonly uniform image2D uImageTop;
#endif
// Reduced mip levels. write_image() stores mip N into uImages[N - 1], so slot 0 holds mip 1
// and slot 5 holds the mip 6 data that the last workgroup reads back.
layout(set = 0, binding = 1, r32f) coherent uniform image2D uImages[12];
// Source texture; read with textureGatherOffset() to produce the values for the top level.
layout(set = 1, binding = 0) uniform sampler2D uTexture;
// Counter shared between workgroups. Invocation 0 of each workgroup that reaches the final stage
// of main() increments it, and the workgroup that observes target_counter resets it to zero.
layout(set = 1, binding = 1) buffer Counter
{
    uint atomic_counter;
};

// Push constants consumed by the shader.
layout(push_constant, std430) uniform Registers
{
    // Multiplied with (z, 1) in transform_z(); the resulting x is then divided by the resulting y.
    mat2 z_transform;
    // Base resolution which mip_resolution() shifts right once per mip level.
    ivec2 resolution;
    // Scale applied to integer texel coordinates to form the texture coordinates used by the gathers.
    vec2 inv_resolution;
    // Level threshold: main() compares this against fixed values to decide which levels to write
    // and when to return early.
    int mips;
    // A workgroup treats itself as the last one to finish when its counter increment reaches this value.
    uint target_counter;
} registers;

// Returns the size of mip level `mip`: the base resolution shifted right by `mip`,
// with each dimension kept at 1 or above.
ivec2 mip_resolution(int mip)
{
    ivec2 shifted = registers.resolution >> mip;
    return max(shifted, ivec2(1));
}

// Binary operator used by every reduction step in this shader.
#define REDUCE_OPERATOR max

// Collapses the four components of `v` into a single value with REDUCE_OPERATOR.
// x is paired with z and y with w first, then the two partial results are combined.
float reduce(vec4 v)
{
    float xz = REDUCE_OPERATOR(v.x, v.z);
    float yw = REDUCE_OPERATOR(v.y, v.w);
    return REDUCE_OPERATOR(xz, yw);
}

// Bit masks for the four lowest bits of the swizzled invocation index.
// Their positions match the x0, y0, y1 and x1 bits decoded by unswizzle16x16().
// NOTE(review): none of these constants is referenced in the visible source.
const int SHUFFLE_X0 = 1 << 0;
const int SHUFFLE_Y0 = 1 << 1;
const int SHUFFLE_Y1 = 1 << 2;
const int SHUFFLE_X1 = 1 << 3;

// Decodes the low 8 bits of `index` into a coordinate with 4 bits per axis.
// Bit layout of `index`, from bit 0 upwards: x0, y0, y1, x1, x2, y2, y3, x3.
// Bits above bit 7 are ignored.
uvec2 unswizzle16x16(uint index)
{
    // X gathers bit 0, bits 3-4 and bit 7.
    uint x_lo = index & 1u;
    uint x_mid = (index >> 3u) & 3u;
    uint x_hi = (index >> 7u) & 1u;

    // Y gathers bits 1-2 and bits 5-6.
    uint y_lo = (index >> 1u) & 3u;
    uint y_hi = (index >> 5u) & 3u;

    uint x = x_lo | (x_mid << 1u) | (x_hi << 3u);
    uint y = y_lo | (y_hi << 2u);
    return uvec2(x, y);
}

// Applies registers.z_transform to each of the four inputs independently:
// every z is extended to (z, 1), multiplied by the matrix, and the resulting x is divided by the resulting y.
vec4 transform_z(vec4 zs)
{
    vec4 numerators;
    vec4 denominators;

    for (int i = 0; i < 4; i++)
    {
        vec2 transformed = registers.z_transform * vec2(zs[i], 1.0);
        numerators[i] = transformed.x;
        denominators[i] = transformed.y;
    }

    return numerators / denominators;
}

// Stores `v`, replicated to all four components, at `coord` of mip level `mip`.
// Mip N lives in uImages[N - 1].
void write_image(ivec2 coord, int mip, float v)
{
    int slot = mip - 1;
    // Out-of-bounds coordinates are not filtered here; image robustness is relied upon to drop them.
    imageStore(uImages[slot], coord, vec4(v, v, v, v));
}

#if defined(WRITE_TOP_LEVEL) && WRITE_TOP_LEVEL
// Stores `v` in the first component of uImageTop at `coord`; the other components are written as zero.
void write_image_top(ivec2 coord, float v)
{
    vec4 texel = vec4(v, 0.0, 0.0, 0.0);
    imageStore(uImageTop, coord, texel);
}
#endif

// Dimensions of the workgroup-shared scratch tile.
const int SHARED_WIDTH = 32;
const int SHARED_HEIGHT = 32;
// Scratch tile indexed as [y][x]; accessed through store_shared() and load_shared().
shared float shared_buffer[SHARED_HEIGHT][SHARED_WIDTH];
// Written by invocation 0 in main() and read by the whole workgroup after a barrier.
shared bool shared_is_last_workgroup;

// Writes `d` into the shared scratch tile at column coord.x, row coord.y.
void store_shared(ivec2 coord, float d)
{
    int row = coord.y;
    int column = coord.x;
    shared_buffer[row][column] = d;
}

// Reads the shared scratch tile at column coord.x, row coord.y.
float load_shared(ivec2 coord)
{
    int row = coord.y;
    int column = coord.x;
    return shared_buffer[row][column];
}

// Gathers one 2x2 footprint from uTexture for `base_coord`.
// The gather result is reordered with .wzxy before being returned.
vec4 fetch_2x2_texture(ivec2 base_coord)
{
    vec2 uv = registers.inv_resolution * vec2(base_coord);
    vec4 gathered = textureGatherOffset(uTexture, uv, ivec2(1, 1));
    return gathered.wzxy;
}

// Gathers four 2x2 footprints from uTexture for `base_coord`, using gather offsets
// (1, 1), (3, 1), (1, 3) and (3, 3). Each footprint is reordered with .wzxy and
// becomes one column of the returned matrix, in that order.
mat4 fetch_4x4_texture(ivec2 base_coord)
{
    vec2 uv = registers.inv_resolution * vec2(base_coord);

    mat4 quads;
    quads[0] = textureGatherOffset(uTexture, uv, ivec2(1, 1)).wzxy;
    quads[1] = textureGatherOffset(uTexture, uv, ivec2(3, 1)).wzxy;
    quads[2] = textureGatherOffset(uTexture, uv, ivec2(1, 3)).wzxy;
    quads[3] = textureGatherOffset(uTexture, uv, ivec2(3, 3)).wzxy;
    return quads;
}

// Loads the 2x2 block of mip 6 (uImages[5]) whose top-left texel is `base_coord`.
// Each coordinate is clamped to the last valid texel of mip 6 before loading.
// Returned order: (0, 0), (1, 0), (0, 1), (1, 1).
vec4 fetch_2x2_image_mip6(ivec2 base_coord)
{
    ivec2 last_texel = mip_resolution(6) - 1;

    vec4 quad;
    quad.x = imageLoad(uImages[5], min(base_coord + ivec2(0, 0), last_texel)).x;
    quad.y = imageLoad(uImages[5], min(base_coord + ivec2(1, 0), last_texel)).x;
    quad.z = imageLoad(uImages[5], min(base_coord + ivec2(0, 1), last_texel)).x;
    quad.w = imageLoad(uImages[5], min(base_coord + ivec2(1, 1), last_texel)).x;
    return quad;
}

// Loads a single texel of mip 6 (uImages[5]) at `coord`, without clamping the coordinate.
float fetch_image_mip6(ivec2 coord)
{
    vec4 texel = imageLoad(uImages[5], coord);
    return texel.x;
}

// Runs transform_z() on the 2x2 quad `v` and returns the transformed quad.
// When WRITE_TOP_LEVEL is enabled, the workgroup additionally writes its 32x32 tile of transformed
// values to uImageTop, with `base_coord` as the tile origin and `local_coord` as this invocation's
// 16x16 coordinate. That path contains barriers, so every invocation of the workgroup must call it.
vec4 write_mip0_transformed(vec4 v, ivec2 base_coord, ivec2 local_coord)
{
    vec4 transformed = transform_z(v);

#if defined(WRITE_TOP_LEVEL) && WRITE_TOP_LEVEL
    // The top-level image should be written with full cache lines per warp.
    // Storing straight from the strided 2x2 layout is noticeably bad for L2 performance, so the quad is
    // staged in shared memory and written back out in a different order. The extra shader-core work pays
    // off because these shaders are fully bandwidth bound.
    ivec2 quad_origin = 2 * local_coord;
    store_shared(quad_origin + ivec2(0, 0), transformed.x);
    store_shared(quad_origin + ivec2(1, 0), transformed.y);
    store_shared(quad_origin + ivec2(0, 1), transformed.z);
    store_shared(quad_origin + ivec2(1, 1), transformed.w);

    barrier();

    // Write out the transformed LOD 0 as four 16x16 tiles: (0, 0), (16, 0), (0, 16), (16, 16).
    for (int tile = 0; tile < 4; tile++)
    {
        ivec2 tile_offset = 16 * ivec2(tile & 1, tile >> 1);
        ivec2 shared_coord = local_coord + tile_offset;
        write_image_top(base_coord + shared_coord, load_shared(shared_coord));
    }

    // The shared tile is reused by the next call, so wait until everyone has read it.
    barrier();
#endif

    return transformed;
}

// For LOD 0 to 6, it is expected that the division is exact,
// i.e., the lower resolution mip is exactly half resolution.
// This way we avoid needing to fold in neighbors.

// Reduces the quad `v` to one value, stores it at `base_coord` of mip level `mip`,
// and returns the reduced value.
float reduce_mip_simple(vec4 v, ivec2 base_coord, int mip)
{
    float result = reduce(v);
    write_image(base_coord, mip, result);
    return result;
}

// Produces texel `base_coord` of mip level `mip` from the previous level held in the shared tile.
// The result is stored to the image for `mip` and returned; it is NOT written back to shared memory.
// This function contains no barriers of its own: the caller must synchronize before calling it
// and before overwriting the shared tile with the returned values.
float reduce_mip_shared(ivec2 base_coord, int mip)
{
    ivec2 src_size = mip_resolution(mip - 1);
    ivec2 dst_size = mip_resolution(mip);
    ivec2 src_coord = 2 * base_coord;

    // When the source dimension is odd, the last target texel along that axis also folds in
    // the extra source column/row.
    bool fold_x = (dst_size.x == base_coord.x + 1) && ((src_size.x & 1) != 0);
    bool fold_y = (dst_size.y == base_coord.y + 1) && ((src_size.y & 1) != 0);

    vec4 quad = vec4(
        load_shared(src_coord + ivec2(0, 0)),
        load_shared(src_coord + ivec2(1, 0)),
        load_shared(src_coord + ivec2(0, 1)),
        load_shared(src_coord + ivec2(1, 1)));
    float result = reduce(quad);

    if (fold_x)
    {
        // Extra column to the right of the 2x2 block.
        result = REDUCE_OPERATOR(result, load_shared(src_coord + ivec2(2, 0)));
        result = REDUCE_OPERATOR(result, load_shared(src_coord + ivec2(2, 1)));
    }

    if (fold_y)
    {
        // Extra row below the 2x2 block.
        result = REDUCE_OPERATOR(result, load_shared(src_coord + ivec2(0, 2)));
        result = REDUCE_OPERATOR(result, load_shared(src_coord + ivec2(1, 2)));
    }

    if (fold_x && fold_y)
    {
        // Corner texel, only needed when both axes fold.
        result = REDUCE_OPERATOR(result, load_shared(src_coord + ivec2(2, 2)));
    }

    write_image(base_coord, mip, result);
    return result;
}

// Produces texel `base_coord` of mip 7 by reading mip 6 back from uImages[5].
// The result is stored both to the mip 7 image and to the shared tile at `base_coord`.
void reduce_mip_lod7(ivec2 base_coord)
{
    ivec2 src_size = mip_resolution(6);
    ivec2 dst_size = mip_resolution(7);
    ivec2 src_coord = 2 * base_coord;

    // NPOT folding for LOD 7: the texel on the last column/row of mip 7 also has to cover the
    // trailing mip 6 column/row that the round-down in the resolution would otherwise drop.
    bool fold_x = (dst_size.x == base_coord.x + 1) && ((src_size.x & 1) != 0);
    bool fold_y = (dst_size.y == base_coord.y + 1) && ((src_size.y & 1) != 0);

    float result = reduce(fetch_2x2_image_mip6(src_coord));

    if (fold_x)
    {
        // Extra column to the right of the 2x2 block.
        result = REDUCE_OPERATOR(result, fetch_image_mip6(src_coord + ivec2(2, 0)));
        result = REDUCE_OPERATOR(result, fetch_image_mip6(src_coord + ivec2(2, 1)));
    }

    if (fold_y)
    {
        // Extra row below the 2x2 block.
        result = REDUCE_OPERATOR(result, fetch_image_mip6(src_coord + ivec2(0, 2)));
        result = REDUCE_OPERATOR(result, fetch_image_mip6(src_coord + ivec2(1, 2)));
    }

    if (fold_x && fold_y)
    {
        // Corner texel, only needed when both axes fold.
        result = REDUCE_OPERATOR(result, fetch_image_mip6(src_coord + ivec2(2, 2)));
    }

    write_image(base_coord, 7, result);
    store_shared(base_coord, result);
}

// Reduces `d` across the four invocations of a subgroup quad using quad swaps.
// Every invocation of the quad returns the reduced value; only the invocation whose
// subgroup invocation ID is a multiple of four stores it at `base_coord` of mip level `mip`.
float reduce_mip_simd4(float d, ivec2 base_coord, int mip)
{
    float from_horizontal = subgroupQuadSwapHorizontal(d);
    float from_vertical = subgroupQuadSwapVertical(d);
    float from_diagonal = subgroupQuadSwapDiagonal(d);

    float result = reduce(vec4(d, from_horizontal, from_vertical, from_diagonal));

    bool is_first_in_quad = (gl_SubgroupInvocationID & 3u) == 0u;
    if (is_first_in_quad)
        write_image(base_coord, mip, result);

    return result;
}

// Each workgroup reduces 64x64 on its own.
// Allows reducing up to a 4096x4096 texture, like SPD.

// Entry point. Each workgroup transforms one 64x64 region of the source and reduces it down to
// at most mip 6. When more than 7 levels are requested, the last workgroup to finish reduces
// the remaining levels (mip 7 and up) for the whole image on its own.
// All early returns depend only on push constants or on a workgroup-shared flag,
// so they are taken by the whole workgroup together.
void main()
{
    // Linear invocation index derived from the subgroup built-ins, remapped to a 16x16 coordinate.
    uint local_index = gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID;
    uvec2 local_coord = unswizzle16x16(local_index);

    // With the unswizzle16x16() bit layout, the first 64 indices cover an 8x8 block of coordinates
    // and the first 4 indices cover a 2x2 block.
    bool is_8x8 = local_index < 64u;
    bool is_2x2 = local_index < 4u;

    // Each invocation owns one 2x2 quad in each of the four 32x32 sub-tiles of the 64x64 region.
    ivec2 base_coord = ivec2(local_coord) * 2 + ivec2(gl_WorkGroupID.xy * 64u);
    ivec2 base_coord_00 = base_coord + ivec2( 0,  0);
    ivec2 base_coord_10 = base_coord + ivec2(32,  0);
    ivec2 base_coord_01 = base_coord + ivec2( 0, 32);
    ivec2 base_coord_11 = base_coord + ivec2(32, 32);

    // Follow FFX SPD's access pattern here.
    // It seems like we need to be super careful about memory access patterns to get optimal bandwidth.

    // LOD 0 feedback with transform.
    // The second argument is the origin of the 32x32 sub-tile, not of this invocation's quad.
    vec4 tile00 = write_mip0_transformed(
        fetch_2x2_texture(base_coord_00), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(0, 0), ivec2(local_coord));
    vec4 tile10 = write_mip0_transformed(
        fetch_2x2_texture(base_coord_10), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(32, 0), ivec2(local_coord));
    vec4 tile01 = write_mip0_transformed(
        fetch_2x2_texture(base_coord_01), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(0, 32), ivec2(local_coord));
    vec4 tile11 = write_mip0_transformed(
        fetch_2x2_texture(base_coord_11), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(32, 32), ivec2(local_coord));
    if (registers.mips <= 1)
        return;

    // Write LOD 1
    // Each invocation reduces its own quad per sub-tile; the sub-tile offsets halve to 16.
    ivec2 base_coord_lod1 = base_coord >> 1;
    float reduced00 = reduce_mip_simple(tile00, base_coord_lod1 + ivec2( 0,  0), 1);
    float reduced10 = reduce_mip_simple(tile10, base_coord_lod1 + ivec2(16,  0), 1);
    float reduced01 = reduce_mip_simple(tile01, base_coord_lod1 + ivec2( 0, 16), 1);
    float reduced11 = reduce_mip_simple(tile11, base_coord_lod1 + ivec2(16, 16), 1);
    if (registers.mips <= 2)
        return;

    // Write LOD 2
    // Reduced across each subgroup quad with quad swaps; no shared memory needed.
    ivec2 base_coord_lod2 = base_coord >> 2;
    reduced00 = reduce_mip_simd4(reduced00, base_coord_lod2 + ivec2(0, 0), 2);
    reduced10 = reduce_mip_simd4(reduced10, base_coord_lod2 + ivec2(8, 0), 2);
    reduced01 = reduce_mip_simd4(reduced01, base_coord_lod2 + ivec2(0, 8), 2);
    reduced11 = reduce_mip_simd4(reduced11, base_coord_lod2 + ivec2(8, 8), 2);

    if (registers.mips <= 3)
        return;

    // One invocation per quad stashes the LOD 2 results, forming a 16x16 tile in shared memory.
    if ((gl_SubgroupInvocationID & 3) == 0)
    {
        ivec2 local_coord_shared = ivec2(local_coord) >> 1;
        store_shared(local_coord_shared + ivec2(0, 0), reduced00);
        store_shared(local_coord_shared + ivec2(8, 0), reduced10);
        store_shared(local_coord_shared + ivec2(0, 8), reduced01);
        store_shared(local_coord_shared + ivec2(8, 8), reduced11);
    }

    // Make the LOD 2 tile visible to the invocations that reduce it.
    barrier();

    // Write LOD 3
    // Only the first 64 invocations (an 8x8 block) take part from here on.
    float reduced = 0.0;
    if (is_8x8)
    {
        ivec2 base_coord_lod3 = ivec2(gl_WorkGroupID.xy * 8u) + ivec2(local_coord);
        ivec2 shared_coord = ivec2(local_coord) * 2;
        float d00 = load_shared(shared_coord + ivec2(0, 0));
        float d10 = load_shared(shared_coord + ivec2(1, 0));
        float d01 = load_shared(shared_coord + ivec2(0, 1));
        float d11 = load_shared(shared_coord + ivec2(1, 1));
        reduced = reduce_mip_simple(vec4(d00, d10, d01, d11), base_coord_lod3, 3);

        // Write LOD 4
        if (registers.mips > 4)
            reduced = reduce_mip_simd4(reduced, base_coord_lod3 >> 1, 4);
    }

    if (registers.mips <= 5)
        return;

    // Need this to ensure there is no write-after-read hazard on the shared buffer.
    barrier();

    // One invocation per quad of the 8x8 block stores its LOD 4 result, forming a 4x4 tile.
    if (is_8x8 && (gl_SubgroupInvocationID & 3) == 0)
        store_shared(ivec2(local_coord) >> 1, reduced);

    // Make the LOD 4 tile visible to the invocations that reduce it.
    barrier();

    // Write LOD 5.
    // Only the first 4 invocations (a 2x2 block) take part.
    if (is_2x2)
    {
        ivec2 base_coord_lod5 = ivec2(gl_WorkGroupID.xy * 2u) + ivec2(local_coord);
        ivec2 shared_coord = ivec2(local_coord) * 2;
        float d00 = load_shared(shared_coord + ivec2(0, 0));
        float d10 = load_shared(shared_coord + ivec2(1, 0));
        float d01 = load_shared(shared_coord + ivec2(0, 1));
        float d11 = load_shared(shared_coord + ivec2(1, 1));
        reduced = reduce_mip_simple(vec4(d00, d10, d01, d11), base_coord_lod5, 5);

        // Write LOD 6
        // The return value is not needed: later levels read mip 6 back from the image.
        if (registers.mips > 6)
            reduce_mip_simd4(reduced, base_coord_lod5 >> 1, 6);
    }

    if (registers.mips <= 7)
        return;

    // Persistent waves
    // Issue an image memory barrier ahead of the counter increment, then let invocation 0 count this
    // workgroup as finished. The flag is shared so that the whole workgroup takes the same branch.
    memoryBarrierImage();
    if (local_index == 0)
        shared_is_last_workgroup = atomicAdd(atomic_counter, 1u) + 1u == registers.target_counter;
    barrier();
    if (!shared_is_last_workgroup)
        return;

    // Reset counter for next iteration.
    if (local_index == 0)
        atomic_counter = 0u;

    // Write LOD 7 for the whole image in 16x16 steps. reduce_mip_lod7() reads mip 6 back from
    // the image and also stores each result in the shared tile for the levels below.
    ivec2 mip_res7 = mip_resolution(7);
    for (int y = 0; y < mip_res7.y; y += 16)
        for (int x = 0; x < mip_res7.x; x += 16)
            reduce_mip_lod7(ivec2(local_coord) + ivec2(x, y));

    // LOD 8 to 12: each level reduces the shared tile in place.
    // The number of active invocations drops by 4x per level: 256, 64, 16, 4, 1.
    for (int mip = 8, invocations = 256; mip <= 12; mip++, invocations /= 4)
    {
        if (registers.mips <= mip)
            break;
        // Wait for the previous level's stores to the shared tile.
        barrier();
        // d is only assigned and only consumed by invocations below the `invocations` limit.
        float d;
        if (local_index < invocations)
            d = reduce_mip_shared(ivec2(local_coord), mip);
        // Wait for all reads of the previous level before overwriting the shared tile.
        barrier();
        if (local_index < invocations)
            store_shared(ivec2(local_coord), d);
    }
}