Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use local memory #122

Closed
timhutton opened this issue Jan 10, 2021 · 0 comments · Fixed by #129
Closed

Use local memory #122

timhutton opened this issue Jan 10, 2021 · 0 comments · Fixed by #129

Comments

@timhutton
Copy link
Member

timhutton commented Jan 10, 2021

Early tests are indicating that this might get us 10% speedup on 3x3 stencils. And the expectation is that larger stencils will see more speedup, because each cell is accessed more times (once by each time it forms part of someone's neighborhood).

(An earlier estimate of larger gains was wrong, because I was comparing different stencil sizes. Which is actually a useful insight - the speed difference between a 5-point Laplacian and a 9-point Laplacian is significant.)

A hand-written kernel:

#define LX 8
#define LY 32
#define LZ 1
#define XR 1
#define YR 1
#define ZR 0
__kernel void rd_compute(__global float4 *a_in,__global float4 *b_in,__global float4 *a_out,__global float4 *b_out)
{
    const int index_x = get_global_id(0);
    const int index_y = get_global_id(1);
    const int index_z = get_global_id(2);
    const int local_x = get_local_id(0);
    const int local_y = get_local_id(1);
    const int local_z = get_local_id(2);
    const int X = get_global_size(0);
    const int Y = get_global_size(1);
    const int Z = get_global_size(2);
    const int index_here = X*(Y*index_z + index_y) + index_x;
    float4 a = a_in[index_here];
    float4 b = b_in[index_here];

    // copy local area into local memory
    local float4 local_a[LX+2*XR][LY+2*YR][LZ+2*ZR]; // expanded to allow access to neighbors
    local float4 local_b[LX+2*XR][LY+2*YR][LZ+2*ZR];
    local_a[local_x+XR][local_y+YR][local_z+ZR] = a;
    local_b[local_x+XR][local_y+YR][local_z+ZR] = b;
    if(local_y==0) {
        local_a[local_x+XR][local_y-1+YR][local_z+ZR] = a_in[X*(Y*index_z + ((index_y-1+Y)%Y)) + index_x];
        local_b[local_x+XR][local_y-1+YR][local_z+ZR] = b_in[X*(Y*index_z + ((index_y-1+Y)%Y)) + index_x];
    }
    else if(local_y==LY-1) {
        local_a[local_x+XR][local_y+1+YR][local_z+ZR] = a_in[X*(Y*index_z + ((index_y+1)%Y)) + index_x];
        local_b[local_x+XR][local_y+1+YR][local_z+ZR] = b_in[X*(Y*index_z + ((index_y+1)%Y)) + index_x];
    }
    if(local_x==0) {
        local_a[local_x-1+XR][local_y+YR][local_z+ZR] = a_in[X*(Y*index_z + index_y) + (index_x-1+X)%X];
        local_b[local_x-1+XR][local_y+YR][local_z+ZR] = b_in[X*(Y*index_z + index_y) + (index_x-1+X)%X];
    }
    else if(local_x==LX-1) {
        local_a[local_x+1+XR][local_y+YR][local_z+ZR] = a_in[X*(Y*index_z + index_y) + (index_x+1)%X];
        local_b[local_x+1+XR][local_y+YR][local_z+ZR] = b_in[X*(Y*index_z + index_y) + (index_x+1)%X];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    float4 a_n = local_a[local_x+XR][local_y-1+YR][local_z+ZR];
    float4 a_s = local_a[local_x+XR][local_y+1+YR][local_z+ZR];
    float4 a_e4 = local_a[local_x+1+XR][local_y+YR][local_z+ZR];
    float4 a_w4 = local_a[local_x-1+XR][local_y+YR][local_z+ZR];
    float4 b_n = local_b[local_x+XR][local_y-1+YR][local_z+ZR];
    float4 b_s = local_b[local_x+XR][local_y+1+YR][local_z+ZR];
    float4 b_e4 = local_b[local_x+1+XR][local_y+YR][local_z+ZR];
    float4 b_w4 = local_b[local_x-1+XR][local_y+YR][local_z+ZR];
    float4 a_e = (float4)(a.yzw, a_e4.x);
    float4 a_w = (float4)(a_w4.w, a.xyz);
    float4 b_e = (float4)(b.yzw, b_e4.x);
    float4 b_w = (float4)(b_w4.w, b.xyz);
    float4 laplacian_a = a_n + a_e + a_s + a_w - 4.0f * a;
    float4 laplacian_b = b_n + b_e + b_s + b_w - 4.0f * b;

    float D_a = 0.082f;
    float D_b = 0.041f;
    float F = 0.035f;
    float k = 0.064f;
    float4 delta_a = D_a * laplacian_a - a*b*b + F*(1.0f-a);
    float4 delta_b = D_b * laplacian_b + a*b*b - (F+k)*b;

    float timestep = 1.0f;
    a_out[index_here] = a + delta_a * timestep;
    b_out[index_here] = b + delta_b * timestep;
}

Here XR, YR, ZR are the radii around each cell that we need to retrieve for the stencils (here 1). LX, LY, LZ are the work group size, and must be chosen such that we don't exceed CL_DEVICE_LOCAL_MEM_SIZE or CL_DEVICE_MAX_WORK_GROUP_SIZE.

We can use local memory for formula image rules.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging a pull request may close this issue.

1 participant