Skip to content

Commit

Permalink
avoid non-uniform barrier control flow when exhausting memory
Browse files Browse the repository at this point in the history
The compute shaders have a check for the succesful completion of their
preceding stage. However, consider a shader execution path like the
following:

	void main()
		if (mem_error != NO_ERROR) {
		    return;
		}
		...
		malloc(...);
		...
		barrier();
		...
	}

and  shader execution that fails to allocate memory, thereby setting
mem_error to ERR_MALLOC_FAILED in malloc before reaching the barrier. If
another shader execution then begins execution, its mem_eror check will
make it return early and not reach the barrier.

All GPU APIs require (dynamically) uniform control flow for barriers,
and the above case may lead to GPU hangs in practice.

Fix this issue by replacing the early exits with careful checks that
don't interrupt barrier control flow.

Unfortunately, it's harder to prove the soundness of the new checks, so
this change also clears dynamic memory ranges in MEM_DEBUG mode when
memory is exhausted. The result is that accessing memory after
exhaustion triggers an error.

Signed-off-by: Elias Naur <[email protected]>
  • Loading branch information
eliasnaur committed Apr 12, 2021
1 parent 74f2003 commit ffdc620
Show file tree
Hide file tree
Showing 16 changed files with 39 additions and 59 deletions.
9 changes: 3 additions & 6 deletions piet-gpu/shader/backdrop.comp
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,13 @@ shared Alloc sh_row_alloc[BACKDROP_WG];
shared uint sh_row_width[BACKDROP_WG];

void main() {
if (mem_error != NO_ERROR) {
return;
}

uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x;
AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);

// Work assignment: 1 thread : 1 path element
uint row_count = 0;
bool mem_ok = mem_error == NO_ERROR;
if (element_ix < conf.n_elements) {
AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
switch (tag.tag) {
Expand All @@ -67,7 +64,7 @@ void main() {
// long as it doesn't cross the left edge.
row_count = 0;
}
Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
sh_row_alloc[th_ix] = path_alloc;
}
}
Expand Down Expand Up @@ -95,7 +92,7 @@ void main() {
}
}
uint width = sh_row_width[el_ix];
if (width > 0) {
if (width > 0 && mem_ok) {
// Process one row sequentially
// Read backdrop value per tile and prefix sum it
Alloc tiles_alloc = sh_row_alloc[el_ix];
Expand Down
Binary file modified piet-gpu/shader/backdrop.spv
Binary file not shown.
8 changes: 2 additions & 6 deletions piet-gpu/shader/binning.comp
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@ shared Alloc sh_chunk_alloc[N_TILE];
shared bool sh_alloc_failed;

void main() {
if (mem_error != NO_ERROR) {
return;
}

uint my_n_elements = conf.n_elements;
uint my_partition = gl_WorkGroupID.x;

Expand Down Expand Up @@ -105,7 +101,7 @@ void main() {
count[i][gl_LocalInvocationID.x] = element_count;
}
// element_count is number of elements covering bin for this invocation.
Alloc chunk_alloc = new_alloc(0, 0);
Alloc chunk_alloc = new_alloc(0, 0, true);
if (element_count != 0) {
// TODO: aggregate atomic adds (subgroup is probably fastest)
MallocResult chunk = malloc(element_count * BinInstance_size);
Expand All @@ -122,7 +118,7 @@ void main() {
write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);

barrier();
if (sh_alloc_failed) {
if (sh_alloc_failed || mem_error != NO_ERROR) {
return;
}

Expand Down
Binary file modified piet-gpu/shader/binning.spv
Binary file not shown.
31 changes: 14 additions & 17 deletions piet-gpu/shader/coarse.comp
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,17 @@ void write_tile_alloc(uint el_ix, Alloc a) {
sh_tile_alloc[el_ix] = a;
}

Alloc read_tile_alloc(uint el_ix) {
Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
return sh_tile_alloc[el_ix];
}
#else
void write_tile_alloc(uint el_ix, Alloc a) {
// No-op
}

Alloc read_tile_alloc(uint el_ix) {
Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
// All memory.
return new_alloc(0, memory.length()*4);
return new_alloc(0, memory.length()*4, mem_ok);
}
#endif

Expand Down Expand Up @@ -109,10 +109,6 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float
}

void main() {
if (mem_error != NO_ERROR) {
return;
}

// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
Expand Down Expand Up @@ -158,6 +154,7 @@ void main() {

uint num_begin_slots = 0;
uint begin_slot = 0;
bool mem_ok = mem_error == NO_ERROR;
while (true) {
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
Expand All @@ -172,7 +169,7 @@ void main() {
uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
count = read_mem(conf.bin_alloc, in_ix);
uint offset = read_mem(conf.bin_alloc, in_ix + 1);
sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size);
sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size, mem_ok);
}
// prefix sum of counts
for (uint i = 0; i < LG_N_PART_READ; i++) {
Expand All @@ -196,7 +193,7 @@ void main() {
}
// use binary search to find element to read
uint ix = rd_ix + th_ix;
if (ix >= wr_ix && ix < ready_ix) {
if (ix >= wr_ix && ix < ready_ix && mem_ok) {
uint part_ix = 0;
for (uint i = 0; i < LG_N_PART_READ; i++) {
uint probe = part_ix + ((N_PART_READ / 2) >> i);
Expand Down Expand Up @@ -253,7 +250,7 @@ void main() {
// base relative to bin
uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
sh_tile_base[th_ix] = base;
Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
write_tile_alloc(th_ix, path_alloc);
break;
default:
Expand Down Expand Up @@ -288,11 +285,11 @@ void main() {
uint width = sh_tile_width[el_ix];
uint x = sh_tile_x0[el_ix] + seq_ix % width;
uint y = sh_tile_y0[el_ix] + seq_ix / width;
bool include_tile;
bool include_tile = false;
if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
include_tile = true;
} else {
Tile tile = Tile_read(read_tile_alloc(el_ix), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
} else if (mem_ok) {
Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
// Include the path in the tile if
// - the tile contains at least a segment (tile offset non-zero)
// - the tile is completely covered (backdrop non-zero)
Expand All @@ -311,7 +308,7 @@ void main() {
// through the non-segment elements.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
while (true) {
while (mem_ok) {
if (bitmap == 0) {
slice_ix++;
if (slice_ix == N_SLICE) {
Expand All @@ -337,7 +334,7 @@ void main() {
if (clip_zero_depth == 0) {
switch (tag.tag) {
case Annotated_Color:
Tile tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
Expand All @@ -348,7 +345,7 @@ void main() {
cmd_ref.offset += 4 + CmdColor_size;
break;
case Annotated_Image:
tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
Expand All @@ -359,7 +356,7 @@ void main() {
cmd_ref.offset += 4 + CmdImage_size;
break;
case Annotated_BeginClip:
tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
+ (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
if (tile.tile.offset == 0 && tile.backdrop == 0) {
clip_zero_depth = clip_depth + 1;
Expand Down
Binary file modified piet-gpu/shader/coarse.spv
Binary file not shown.
5 changes: 0 additions & 5 deletions piet-gpu/shader/elements.comp
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,6 @@ shared uint sh_part_ix;
shared State sh_prefix;

void main() {
if (mem_error != NO_ERROR) {
return;
}

State th_state[N_ROWS];
// Determine partition to process by atomic counter (described in Section
// 4.4 of prefix sum paper).
Expand Down Expand Up @@ -392,7 +388,6 @@ void main() {
vec2 lw = get_linewidth(st);
anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
} else {
anno_begin_clip.bbox = begin_clip.bbox;
anno_fill.linewidth = 0.0;
}
out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
Expand Down
Binary file modified piet-gpu/shader/elements.spv
Binary file not shown.
11 changes: 4 additions & 7 deletions piet-gpu/shader/kernel4.comp
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,6 @@ vec4[CHUNK] fillImage(uvec2 xy, CmdImage cmd_img) {
}

void main() {
if (mem_error != NO_ERROR) {
return;
}

uint tile_ix = gl_WorkGroupID.y * conf.width_in_tiles + gl_WorkGroupID.x;
Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
Expand Down Expand Up @@ -118,7 +114,8 @@ void main() {

float area[CHUNK];
uint clip_depth = 0;
while (true) {
bool mem_ok = mem_error == NO_ERROR;
while (mem_ok) {
uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag;
if (tag == Cmd_End) {
break;
Expand All @@ -131,7 +128,7 @@ void main() {
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
vec2 line_vec = seg.vector;
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
Expand All @@ -152,7 +149,7 @@ void main() {
tile_seg_ref = TileSegRef(fill.tile_ref);
// Calculate coverage based on backdrop + coverage of each line segment
do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size), tile_seg_ref);
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = xy + vec2(chunk_offset(k));
vec2 start = seg.origin - my_xy;
Expand Down
Binary file modified piet-gpu/shader/kernel4.spv
Binary file not shown.
Binary file modified piet-gpu/shader/kernel4_idx.spv
Binary file not shown.
19 changes: 12 additions & 7 deletions piet-gpu/shader/mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,23 +44,26 @@ struct MallocResult {
};

// new_alloc synthesizes an Alloc from an offset and size.
Alloc new_alloc(uint offset, uint size) {
Alloc new_alloc(uint offset, uint size, bool mem_ok) {
Alloc a;
a.offset = offset;
#ifdef MEM_DEBUG
a.size = size;
if (mem_ok) {
a.size = size;
} else {
a.size = 0;
}
#endif
return a;
}

// malloc allocates size bytes of memory.
MallocResult malloc(uint size) {
MallocResult r;
r.failed = false;
uint offset = atomicAdd(mem_offset, size);
r.alloc = new_alloc(offset, size);
if (offset + size > memory.length() * 4) {
r.failed = true;
r.failed = offset + size > memory.length() * 4;
r.alloc = new_alloc(offset, size, !r.failed);
if (r.failed) {
atomicMax(mem_error, ERR_MALLOC_FAILED);
return r;
}
Expand Down Expand Up @@ -119,8 +122,10 @@ Alloc slice_mem(Alloc a, uint offset, uint size) {
// but never written.
return Alloc(0, 0);
}
return Alloc(a.offset + offset, size);
#else
return Alloc(a.offset + offset);
#endif
return new_alloc(a.offset + offset, size);
}

// alloc_write writes alloc to memory at offset bytes.
Expand Down
9 changes: 3 additions & 6 deletions piet-gpu/shader/path_coarse.comp
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,14 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
}

void main() {
if (mem_error != NO_ERROR) {
return;
}

uint element_ix = gl_GlobalInvocationID.x;
PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);

PathSegTag tag = PathSegTag(PathSeg_Nop, 0);
if (element_ix < conf.n_pathseg) {
tag = PathSeg_tag(conf.pathseg_alloc, ref);
}
bool mem_ok = mem_error == NO_ERROR;
switch (tag.tag) {
case PathSeg_Cubic:
PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);
Expand Down Expand Up @@ -135,7 +132,7 @@ void main() {
bool is_stroke = fill_mode_from_flags(tag.flags) == MODE_STROKE;
uint path_ix = cubic.path_ix;
Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
ivec4 bbox = ivec4(path.bbox);
vec2 p0 = cubic.p0;
qp0 = cubic.p0;
Expand Down Expand Up @@ -195,7 +192,7 @@ void main() {
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
// Consider using subgroups to aggregate atomic add.
MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size);
if (tile_alloc.failed) {
if (tile_alloc.failed || !mem_ok) {
return;
}
uint tile_offset = tile_alloc.alloc.offset;
Expand Down
Binary file modified piet-gpu/shader/path_coarse.spv
Binary file not shown.
6 changes: 1 addition & 5 deletions piet-gpu/shader/tile_alloc.comp
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,6 @@ shared uint sh_tile_count[TILE_ALLOC_WG];
shared MallocResult sh_tile_alloc;

void main() {
if (mem_error != NO_ERROR) {
return;
}

uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x;
PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
Expand Down Expand Up @@ -86,7 +82,7 @@ void main() {
}
barrier();
MallocResult alloc_start = sh_tile_alloc;
if (alloc_start.failed) {
if (alloc_start.failed || mem_error != NO_ERROR) {
return;
}

Expand Down
Binary file modified piet-gpu/shader/tile_alloc.spv
Binary file not shown.

0 comments on commit ffdc620

Please sign in to comment.