avoid non-uniform barrier control flow when exhausting memory

The compute shaders have a check for the succesful completion of their preceding stage. However, consider a shader execution path like the following: void main() if (mem_error != NO_ERROR) { return; } ... malloc(...); ... barrier(); ... } and shader execution that fails to allocate memory, thereby setting mem_error to ERR_MALLOC_FAILED in malloc before reaching the barrier. If another shader execution then begins execution, its mem_eror check will make it return early and not reach the barrier. All GPU APIs require (dynamically) uniform control flow for barriers, and the above case may lead to GPU hangs in practice. Fix this issue by replacing the early exits with careful checks that don't interrupt barrier control flow. Unfortunately, it's harder to prove the soundness of the new checks, so this change also clears dynamic memory ranges in MEM_DEBUG mode when memory is exhausted. The result is that accessing memory after exhaustion triggers an error. Signed-off-by: Elias Naur <[email protected]>
eliasnaur · Apr 12, 2021 · ffdc620 · ffdc620
1 parent 74f2003
commit ffdc620
Show file tree

Hide file tree

Showing 16 changed files with 39 additions and 59 deletions.
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
@@ -35,16 +35,13 @@ shared Alloc sh_row_alloc[BACKDROP_WG];
 shared uint sh_row_width[BACKDROP_WG];
 
 void main() {
-    if (mem_error != NO_ERROR) {
-        return;
-    }
-
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
     AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
 
     // Work assignment: 1 thread : 1 path element
     uint row_count = 0;
+    bool mem_ok = mem_error == NO_ERROR;
     if (element_ix < conf.n_elements) {
         AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
         switch (tag.tag) {
@@ -67,7 +64,7 @@ void main() {
                 // long as it doesn't cross the left edge.
                 row_count = 0;
             }
-            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
+            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
             sh_row_alloc[th_ix] = path_alloc;
         }
     }
@@ -95,7 +92,7 @@ void main() {
             }
         }
         uint width = sh_row_width[el_ix];
-        if (width > 0) {
+        if (width > 0 && mem_ok) {
             // Process one row sequentially
             // Read backdrop value per tile and prefix sum it
             Alloc tiles_alloc = sh_row_alloc[el_ix];

diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
@@ -36,10 +36,6 @@ shared Alloc sh_chunk_alloc[N_TILE];
 shared bool sh_alloc_failed;
 
 void main() {
-    if (mem_error != NO_ERROR) {
-        return;
-    }
-
     uint my_n_elements = conf.n_elements;
     uint my_partition = gl_WorkGroupID.x;
 
@@ -105,7 +101,7 @@ void main() {
         count[i][gl_LocalInvocationID.x] = element_count;
     }
     // element_count is number of elements covering bin for this invocation.
-    Alloc chunk_alloc = new_alloc(0, 0);
+    Alloc chunk_alloc = new_alloc(0, 0, true);
     if (element_count != 0) {
         // TODO: aggregate atomic adds (subgroup is probably fastest)
         MallocResult chunk = malloc(element_count * BinInstance_size);
@@ -122,7 +118,7 @@ void main() {
     write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
 
     barrier();
-    if (sh_alloc_failed) {
+    if (sh_alloc_failed || mem_error != NO_ERROR) {
         return;
     }
 

diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
@@ -56,17 +56,17 @@ void write_tile_alloc(uint el_ix, Alloc a) {
     sh_tile_alloc[el_ix] = a;
 }
 
-Alloc read_tile_alloc(uint el_ix) {
+Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
     return sh_tile_alloc[el_ix];
 }
 #else
 void write_tile_alloc(uint el_ix, Alloc a) {
     // No-op
 }
 
-Alloc read_tile_alloc(uint el_ix) {
+Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
     // All memory.
-    return new_alloc(0, memory.length()*4);
+    return new_alloc(0, memory.length()*4, mem_ok);
 }
 #endif
 
@@ -109,10 +109,6 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float
 }
 
 void main() {
-    if (mem_error != NO_ERROR) {
-        return;
-    }
-
     // Could use either linear or 2d layouts for both dispatch and
     // invocations within the workgroup. We'll use variables to abstract.
     uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
@@ -158,6 +154,7 @@ void main() {
 
     uint num_begin_slots = 0;
     uint begin_slot = 0;
+    bool mem_ok = mem_error == NO_ERROR;
     while (true) {
         for (uint i = 0; i < N_SLICE; i++) {
             sh_bitmaps[i][th_ix] = 0;
@@ -172,7 +169,7 @@ void main() {
                     uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
                     count = read_mem(conf.bin_alloc, in_ix);
                     uint offset = read_mem(conf.bin_alloc, in_ix + 1);
-                    sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size);
+                    sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size, mem_ok);
                 }
                 // prefix sum of counts
                 for (uint i = 0; i < LG_N_PART_READ; i++) {
@@ -196,7 +193,7 @@ void main() {
             }
             // use binary search to find element to read
             uint ix = rd_ix + th_ix;
-            if (ix >= wr_ix && ix < ready_ix) {
+            if (ix >= wr_ix && ix < ready_ix && mem_ok) {
                 uint part_ix = 0;
                 for (uint i = 0; i < LG_N_PART_READ; i++) {
                     uint probe = part_ix + ((N_PART_READ / 2) >> i);
@@ -253,7 +250,7 @@ void main() {
             // base relative to bin
             uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
             sh_tile_base[th_ix] = base;
-            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
+            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
             write_tile_alloc(th_ix, path_alloc);
             break;
         default:
@@ -288,11 +285,11 @@ void main() {
             uint width = sh_tile_width[el_ix];
             uint x = sh_tile_x0[el_ix] + seq_ix % width;
             uint y = sh_tile_y0[el_ix] + seq_ix / width;
-            bool include_tile;
+            bool include_tile = false;
             if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
                 include_tile = true;
-            } else {
-                Tile tile = Tile_read(read_tile_alloc(el_ix), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
+            } else if (mem_ok) {
+                Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
                 // Include the path in the tile if
                 // - the tile contains at least a segment (tile offset non-zero)
                 // - the tile is completely covered (backdrop non-zero)
@@ -311,7 +308,7 @@ void main() {
         // through the non-segment elements.
         uint slice_ix = 0;
         uint bitmap = sh_bitmaps[0][th_ix];
-        while (true) {
+        while (mem_ok) {
             if (bitmap == 0) {
                 slice_ix++;
                 if (slice_ix == N_SLICE) {
@@ -337,7 +334,7 @@ void main() {
             if (clip_zero_depth == 0) {
                 switch (tag.tag) {
                 case Annotated_Color:
-                    Tile tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
+                    Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
                         + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref);
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
@@ -348,7 +345,7 @@ void main() {
                     cmd_ref.offset += 4 + CmdColor_size;
                     break;
                 case Annotated_Image:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
+                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
                         + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref);
                     if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
@@ -359,7 +356,7 @@ void main() {
                     cmd_ref.offset += 4 + CmdImage_size;
                     break;
                 case Annotated_BeginClip:
-                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
+                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
                         + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     if (tile.tile.offset == 0 && tile.backdrop == 0) {
                         clip_zero_depth = clip_depth + 1;

diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
@@ -176,10 +176,6 @@ shared uint sh_part_ix;
 shared State sh_prefix;
 
 void main() {
-    if (mem_error != NO_ERROR) {
-        return;
-    }
-
     State th_state[N_ROWS];
     // Determine partition to process by atomic counter (described in Section
     // 4.4 of prefix sum paper).
@@ -392,7 +388,6 @@ void main() {
                 vec2 lw = get_linewidth(st);
                 anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
             } else {
-                anno_begin_clip.bbox = begin_clip.bbox;
                 anno_fill.linewidth = 0.0;
             }
             out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);

diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
@@ -85,10 +85,6 @@ vec4[CHUNK] fillImage(uvec2 xy, CmdImage cmd_img) {
 }
 
 void main() {
-    if (mem_error != NO_ERROR) {
-        return;
-    }
-
     uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x;
     Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
     CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
@@ -118,7 +114,8 @@ void main() {
 
     float area[CHUNK];
     uint clip_depth = 0;
-    while (true) {
+    bool mem_ok = mem_error == NO_ERROR;
+    while (mem_ok) {
         uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag;
         if (tag == Cmd_End) {
             break;
@@ -131,7 +128,7 @@ void main() {
             for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
             TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
             do {
-                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
+                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
                 vec2 line_vec = seg.vector;
                 for (uint k = 0; k < CHUNK; k++) {
                     vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
@@ -152,7 +149,7 @@ void main() {
             tile_seg_ref = TileSegRef(fill.tile_ref);
             // Calculate coverage based on backdrop + coverage of each line segment
             do {
-                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
+                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
                 for (uint k = 0; k < CHUNK; k++) {
                     vec2 my_xy = xy + vec2(chunk_offset(k));
                     vec2 start = seg.origin - my_xy;

diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
diff --git a/piet-gpu/shader/kernel4_idx.spv b/piet-gpu/shader/kernel4_idx.spv
diff --git a/piet-gpu/shader/mem.h b/piet-gpu/shader/mem.h
@@ -44,23 +44,26 @@ struct MallocResult {
 };
 
 // new_alloc synthesizes an Alloc from an offset and size.
-Alloc new_alloc(uint offset, uint size) {
+Alloc new_alloc(uint offset, uint size, bool mem_ok) {
     Alloc a;
     a.offset = offset;
 #ifdef MEM_DEBUG
-    a.size = size;
+    if (mem_ok) {
+        a.size = size;
+    } else {
+        a.size = 0;
+    }
 #endif
     return a;
 }
 
 // malloc allocates size bytes of memory.
 MallocResult malloc(uint size) {
     MallocResult r;
-    r.failed = false;
     uint offset = atomicAdd(mem_offset, size);
-    r.alloc = new_alloc(offset, size);
-    if (offset + size > memory.length() * 4) {
-        r.failed = true;
+    r.failed = offset + size > memory.length() * 4;
+    r.alloc = new_alloc(offset, size, !r.failed);
+    if (r.failed) {
         atomicMax(mem_error, ERR_MALLOC_FAILED);
         return r;
     }
@@ -119,8 +122,10 @@ Alloc slice_mem(Alloc a, uint offset, uint size) {
         // but never written.
         return Alloc(0, 0);
     }
+    return Alloc(a.offset + offset, size);
+#else
+    return Alloc(a.offset + offset);
 #endif
-    return new_alloc(a.offset + offset, size);
 }
 
 // alloc_write writes alloc to memory at offset bytes.

diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
@@ -87,17 +87,14 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
 }
 
 void main() {
-    if (mem_error != NO_ERROR) {
-        return;
-    }
-
     uint element_ix = gl_GlobalInvocationID.x;
     PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);
 
     PathSegTag tag = PathSegTag(PathSeg_Nop, 0);
     if (element_ix < conf.n_pathseg) {
         tag = PathSeg_tag(conf.pathseg_alloc, ref);
     }
+    bool mem_ok = mem_error == NO_ERROR;
     switch (tag.tag) {
     case PathSeg_Cubic:
         PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);
@@ -135,7 +132,7 @@ void main() {
         bool is_stroke = fill_mode_from_flags(tag.flags) == MODE_STROKE;
         uint path_ix = cubic.path_ix;
         Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
-        Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
+        Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
         ivec4 bbox = ivec4(path.bbox);
         vec2 p0 = cubic.p0;
         qp0 = cubic.p0;
@@ -195,7 +192,7 @@ void main() {
                 uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
                 // Consider using subgroups to aggregate atomic add.
                 MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size);
-                if (tile_alloc.failed) {
+                if (tile_alloc.failed || !mem_ok) {
                     return;
                 }
                 uint tile_offset = tile_alloc.alloc.offset;

diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp
@@ -28,10 +28,6 @@ shared uint sh_tile_count[TILE_ALLOC_WG];
 shared MallocResult sh_tile_alloc;
 
 void main() {
-    if (mem_error != NO_ERROR) {
-        return;
-    }
-
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
     PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
@@ -86,7 +82,7 @@ void main() {
     }
     barrier();
     MallocResult alloc_start = sh_tile_alloc;
-    if (alloc_start.failed) {
+    if (alloc_start.failed || mem_error != NO_ERROR) {
         return;
     }
 

diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv