From 7cb397bef54dae2e981cdcae5f2bbd67a945f22a Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Tue, 2 Jun 2020 16:03:14 -0700 Subject: [PATCH 01/29] Make all tests compile with threads to silence clang's errors and GCC's warnings. --- cmake/HalideTestHelpers.cmake | 2 +- test/correctness/CMakeLists.txt | 13 ------------- test/performance/CMakeLists.txt | 5 +++-- test/warning/CMakeLists.txt | 2 +- 4 files changed, 5 insertions(+), 17 deletions(-) diff --git a/cmake/HalideTestHelpers.cmake b/cmake/HalideTestHelpers.cmake index 42f9fab1177f..c023a8b5150a 100644 --- a/cmake/HalideTestHelpers.cmake +++ b/cmake/HalideTestHelpers.cmake @@ -28,7 +28,7 @@ if (NOT TARGET Halide::Test) add_library(Halide::Test ALIAS Halide_test) # Obviously, link to the main library - target_link_libraries(Halide_test INTERFACE Halide::Halide) + target_link_libraries(Halide_test INTERFACE Halide::Halide Threads::Threads) # Everyone gets to see the common headers target_include_directories(Halide_test diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 8fbe20ddb3ab..99226d16a4f1 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -335,19 +335,6 @@ tests(GROUPS correctness # Make sure the test that needs image_io has it target_link_libraries(correctness_image_io PRIVATE Halide::ImageIO) -# Some tests explicitly need threads. -foreach (TEST IN ITEMS - correctness_boundary_conditions - correctness_gpu_allocation_cache - correctness_mul_div_mod - correctness_simd_op_check - correctness_simd_op_check_hvx - correctness_thread_safety - correctness_vector_cast - correctness_vector_math) - target_link_libraries(${TEST} PRIVATE Threads::Threads) -endforeach () - # Tests which use external funcs need to enable exports. foreach (TEST IN ITEMS correctness_async diff --git a/test/performance/CMakeLists.txt b/test/performance/CMakeLists.txt index 7341dfc89b88..a9597c729e7b 100644 --- a/test/performance/CMakeLists.txt +++ b/test/performance/CMakeLists.txt @@ -29,8 +29,9 @@ tests(GROUPS performance wrap.cpp ) +# Make sure that performance tests do not run in parallel with other tests, +# since doing so might make them flaky. set_tests_properties(${TEST_NAMES} PROPERTIES RUN_SERIAL TRUE) -target_link_libraries(performance_thread_safe_jit PRIVATE Threads::Threads) - +# This test needs rdynamic or equivalent set_target_properties(performance_fast_pow PROPERTIES ENABLE_EXPORTS TRUE) diff --git a/test/warning/CMakeLists.txt b/test/warning/CMakeLists.txt index 8b1f12dc4f11..a9b6a5ff1eb6 100644 --- a/test/warning/CMakeLists.txt +++ b/test/warning/CMakeLists.txt @@ -6,5 +6,5 @@ tests(GROUPS warning sliding_vectors.cpp ) -# Don't look for "Success!" in warning tests. +# Don't look for "Success!" in warning tests, look for "Warning:" instead. set_tests_properties(${TEST_NAMES} PROPERTIES PASS_REGULAR_EXPRESSION "Warning:") From 6c041b825da75cb23e6179ad428f129c0ef1245e Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Tue, 2 Jun 2020 16:03:50 -0700 Subject: [PATCH 02/29] Drive-by docs fix to make sure that when compiling for Win32, users use the 64bit host tools anyway (avoid out of memory errors building LLVM) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f1d888046841..e3f670139a71 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ D:\> "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary For a 32-bit build, run: ``` -D:\> "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x86 +D:\> "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 ``` #### Managing dependencies with vcpkg From 5e469ad80e8d9d12acc402b5223c0d49db7d766d Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Tue, 2 Jun 2020 16:04:14 -0700 Subject: [PATCH 03/29] Make sure that generators run with the full PATH available on Windows. --- cmake/HalideGeneratorHelpers.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/HalideGeneratorHelpers.cmake b/cmake/HalideGeneratorHelpers.cmake index 4b0d9aac212d..2a9c3d9b0baf 100644 --- a/cmake/HalideGeneratorHelpers.cmake +++ b/cmake/HalideGeneratorHelpers.cmake @@ -133,7 +133,9 @@ function(add_halide_library TARGET) # On Linux, RPATH allows the generator to find Halide, but we need to add it to the PATH on Windows. set(generatorCommand ${ARG_FROM}) if (WIN32) - set(generatorCommand ${CMAKE_COMMAND} -E env "PATH=$>" "$") + set(newPath "$" $ENV{PATH}) + string(REPLACE ";" "$" newPath "${newPath}") + set(generatorCommand ${CMAKE_COMMAND} -E env "PATH=$" "$") endif () # The output file name might not match the host when cross compiling. From 857ce2d7964faff0cfd10cad6a950039eada1fb2 Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Wed, 3 Jun 2020 13:10:06 -0400 Subject: [PATCH 04/29] Fix intentional read-before-write of shared mem allocation --- test/correctness/gpu_thread_barrier.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/correctness/gpu_thread_barrier.cpp b/test/correctness/gpu_thread_barrier.cpp index a4d18199f8c5..66b8dc28bde9 100644 --- a/test/correctness/gpu_thread_barrier.cpp +++ b/test/correctness/gpu_thread_barrier.cpp @@ -106,7 +106,8 @@ int main(int argc, char **argv) { Func f; Var x, y; - f(x, y) = undef(); + f(x, y) = 0; + f(x, y) += undef(); f(x, y) += x + 100 * y; // This next line is dubious, because it entirely masks the // effect of the previous definition. If you add an undefined @@ -128,9 +129,10 @@ int main(int argc, char **argv) { f.update(1).gpu_threads(x, y); f.update(2).gpu_threads(x, y); - // There should be two thread barriers: one in between the + // There should be three thread barriers: one after the intial + // pure definition, one in between the // non-undef definitions, and one between f and g. - g.add_custom_lowering_pass(new CheckBarrierCount(2)); + g.add_custom_lowering_pass(new CheckBarrierCount(3)); Buffer out = g.realize(100, 100); } From 09c343a9d96f7e79e8fc8087bcedcf0e2c5389f2 Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Wed, 3 Jun 2020 16:01:38 -0400 Subject: [PATCH 05/29] Use a schedule that requires less shared mem for D3D12. --- apps/camera_pipe/camera_pipe_generator.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/apps/camera_pipe/camera_pipe_generator.cpp b/apps/camera_pipe/camera_pipe_generator.cpp index 9a5d154476c8..4fedec0eac38 100644 --- a/apps/camera_pipe/camera_pipe_generator.cpp +++ b/apps/camera_pipe/camera_pipe_generator.cpp @@ -459,11 +459,22 @@ void CameraPipe::generate() { Var xi, yi, xii, xio; - /* 1391us on a gtx 980. */ + /* These tile factors obtain 1391us on a gtx 980. */ + int tile_x = 28; + int tile_y = 12; + + if (get_target().has_feature(Target::D3D12Compute)) { + // D3D12 can only utilize a limited amount of + // shared memory, so we use a slightly smaller + // tile size. + tile_x = 20; + tile_y = 12; + } + processed.compute_root() .reorder(c, x, y) .unroll(x, 2) - .gpu_tile(x, y, xi, yi, 28, 12); + .gpu_tile(x, y, xi, yi, tile_x, tile_y); curved.compute_at(processed, x) .unroll(x, 2) From 678ad064870e7dc9b3f8521fd8a3ab917a0d56f5 Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Wed, 3 Jun 2020 17:22:06 -0400 Subject: [PATCH 06/29] Try Alex's suggested change --- src/runtime/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt index f648aec33af8..664380749d43 100644 --- a/src/runtime/CMakeLists.txt +++ b/src/runtime/CMakeLists.txt @@ -165,8 +165,8 @@ foreach (i IN LISTS RUNTIME_CPP) set(SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/${i}.cpp") - set(RUNTIME_DEFINES -DCOMPILING_HALIDE_RUNTIME -DLLVM_VERSION=${LLVM_VERSION} -DBITS_${j}) - set(RUNTIME_DEFINES_debug -g -DDEBUG_RUNTIME ${RUNTIME_DEFINES}) + set(RUNTIME_DEFINES -g -DCOMPILING_HALIDE_RUNTIME -DLLVM_VERSION=${LLVM_VERSION} -DBITS_${j}) + set(RUNTIME_DEFINES_debug -DDEBUG_RUNTIME ${RUNTIME_DEFINES}) foreach (SUFFIX IN ITEMS "" "_debug") set(LL "initmod.${i}_${j}${SUFFIX}.ll") From 8e27b3fac13855da6671ad505a766b6d8fb9a8ee Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Wed, 3 Jun 2020 17:43:55 -0400 Subject: [PATCH 07/29] Fix rest of apps tests --- src/runtime/d3d12compute.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index 220d7282506c..f4a171893a1e 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -1637,8 +1637,9 @@ static d3d12_function *new_function_with_name(d3d12_device *device, d3d12_librar int shared_mem_bytes, int threadsX, int threadsY, int threadsZ) { TRACELOG; - // Round shared memory size up to a multiple of 16: - shared_mem_bytes = (shared_mem_bytes + 0xF) & ~0xF; + // Round shared memory size up to a non-zero multiple of 16 + TRACEPRINT("groupshared memory size before modification: " << shared_mem_bytes << "\n"); + shared_mem_bytes = ((shared_mem_bytes > 0 ? shared_mem_bytes : 1) + 0xF) & ~0xF; TRACEPRINT("groupshared memory size: " << shared_mem_bytes << " bytes.\n"); TRACEPRINT("numthreads( " << threadsX << ", " << threadsY << ", " << threadsZ << " )\n"); From 748143c9e5be35df3bff29939503af683ae160e0 Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Wed, 3 Jun 2020 19:41:37 -0400 Subject: [PATCH 08/29] Use same strategy as Metal for hist app --- apps/hist/hist_generator.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/hist/hist_generator.cpp b/apps/hist/hist_generator.cpp index 065255873cd8..1767d176f22b 100644 --- a/apps/hist/hist_generator.cpp +++ b/apps/hist/hist_generator.cpp @@ -114,10 +114,11 @@ class Hist : public Halide::Generator { // along the z dimension. hist_rows.update().gpu_tile(x, y, xi, yi, 32, 8); - if (!get_target().has_feature(Target::Metal)) { + if (!get_target().has_feature(Target::Metal) && + !get_target().has_feature(Target::D3D12Compute)) { // bound_extent doesn't currently work inside - // metal kernels because we can't compile the - // assertion. For metal we just inline the + // metal & d3d12compute kernels because we can't compile the + // assertion. For metal & d3d12compute we just inline the // luma computation. Y.clone_in(intm) .compute_at(intm.in(), y) From 188c1554fd9cc596e16fba42071edea3f3f8d602 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 3 Jun 2020 17:28:56 -0700 Subject: [PATCH 09/29] Touch From c5ced30d695c0aa9f714545f8ee39201d3dc8c8b Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Thu, 4 Jun 2020 14:36:41 -0400 Subject: [PATCH 10/29] Revert change --- src/runtime/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt index 664380749d43..f648aec33af8 100644 --- a/src/runtime/CMakeLists.txt +++ b/src/runtime/CMakeLists.txt @@ -165,8 +165,8 @@ foreach (i IN LISTS RUNTIME_CPP) set(SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/${i}.cpp") - set(RUNTIME_DEFINES -g -DCOMPILING_HALIDE_RUNTIME -DLLVM_VERSION=${LLVM_VERSION} -DBITS_${j}) - set(RUNTIME_DEFINES_debug -DDEBUG_RUNTIME ${RUNTIME_DEFINES}) + set(RUNTIME_DEFINES -DCOMPILING_HALIDE_RUNTIME -DLLVM_VERSION=${LLVM_VERSION} -DBITS_${j}) + set(RUNTIME_DEFINES_debug -g -DDEBUG_RUNTIME ${RUNTIME_DEFINES}) foreach (SUFFIX IN ITEMS "" "_debug") set(LL "initmod.${i}_${j}${SUFFIX}.ll") From 5f5b4a4b8a9b12b7f7b17a1ccc73c99fc409314e Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Thu, 4 Jun 2020 15:41:46 -0400 Subject: [PATCH 11/29] Fix nl_means by uniquifying global shared allocs. --- src/CodeGen_D3D12Compute_Dev.cpp | 77 ++++++++++++++++++++--- test/correctness/gpu_allocation_cache.cpp | 5 ++ 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index 377ebd36e4ab..c469f665a8d9 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -896,22 +896,79 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, constants[i].size += constants[i - 1].size; } - // Find all the shared allocations and declare them at global scope. - class FindSharedAllocations : public IRVisitor { - using IRVisitor::visit; - void visit(const Allocate *op) override { - op->body.accept(this); + // Find all the shared allocations, uniquify their names, + // and declare them at global scope. + class FindSharedAllocationsAndUniquify : public IRMutator { + using IRMutator::visit; + Stmt visit(const Allocate *op) override { if (is_shared_allocation(op)) { - allocs.push_back(op); + // Because these will go in global scope, + // we need to ensure they have unique names. + std::string new_name = unique_name(op->name); + replacements[op->name] = new_name; + + std::vector new_extents; + for (size_t i = 0; i < op->extents.size(); i++) { + new_extents.push_back(mutate(op->extents[i])); + } + Stmt new_body = mutate(op->body); + Expr new_condition = mutate(op->condition); + Expr new_new_expr; + if (op->new_expr.defined()) { + new_new_expr = mutate(op->new_expr); + } + + Stmt new_alloc = Allocate::make(new_name, op->type, op->memory_type, new_extents, + std::move(new_condition), std::move(new_body), + std::move(new_new_expr), op->free_function); + + allocs.push_back(new_alloc); + replacements.erase(op->name); + return new_alloc; + } else { + return IRMutator::visit(op); } } + + Stmt visit(const Free *op) override { + if (replacements.count(op->name)) { + return Free::make(replacements[op->name]); + } else { + return IRMutator::visit(op); + } + } + + Expr visit(const Load *op) override { + if (replacements.count(op->name)) { + return Load::make(op->type, replacements[op->name], + std::move(mutate(op->index)), op->image, op->param, + std::move(mutate(op->predicate)), op->alignment); + } else { + return IRMutator::visit(op); + } + } + + Stmt visit(const Store *op) override { + if (replacements.count(op->name)) { + return Store::make(replacements[op->name], std::move(mutate(op->value)), + std::move(mutate(op->index)), op->param, + std::move(mutate(op->predicate)), op->alignment); + } else { + return IRMutator::visit(op); + } + } + + std::map replacements; friend class CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C; - vector allocs; + vector allocs; }; - FindSharedAllocations fsa; - s.accept(&fsa); + + FindSharedAllocationsAndUniquify fsa; + s = fsa.mutate(s); + uint32_t total_shared_bytes = 0; - for (const Allocate *op : fsa.allocs) { + for (Stmt sop : fsa.allocs) { + const Allocate *op = sop.as(); internal_assert(op->extents.size() == 1); internal_assert(op->type.lanes() == 1); // In D3D12/HLSL, only 32bit types (int/uint/float) are suppoerted (even diff --git a/test/correctness/gpu_allocation_cache.cpp b/test/correctness/gpu_allocation_cache.cpp index b2f748600797..6dc4ea75ae15 100644 --- a/test/correctness/gpu_allocation_cache.cpp +++ b/test/correctness/gpu_allocation_cache.cpp @@ -18,6 +18,11 @@ int main(int argc, char **argv) { printf("[SKIP] No GPU target enabled.\n"); return 0; } + if (target.has_feature(Target::D3D12Compute)) { + // https://github.com/halide/Halide/issues/5000 + printf("[SKIP] Allocation cache not yet implemented for D3D12Compute.\n"); + return 0; + } const int N = 30; Var x, y, xi, yi; From f12db6379938db41d29004c4741e0ae07f81116d Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Thu, 4 Jun 2020 17:06:32 -0400 Subject: [PATCH 12/29] Address review comments --- apps/camera_pipe/camera_pipe_generator.cpp | 2 +- src/CodeGen_D3D12Compute_Dev.cpp | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/apps/camera_pipe/camera_pipe_generator.cpp b/apps/camera_pipe/camera_pipe_generator.cpp index 4fedec0eac38..98cf0c2c8b13 100644 --- a/apps/camera_pipe/camera_pipe_generator.cpp +++ b/apps/camera_pipe/camera_pipe_generator.cpp @@ -464,7 +464,7 @@ void CameraPipe::generate() { int tile_y = 12; if (get_target().has_feature(Target::D3D12Compute)) { - // D3D12 can only utilize a limited amount of + // D3D12 SM 5.1 can only utilize a limited amount of // shared memory, so we use a slightly smaller // tile size. tile_x = 20; diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index c469f665a8d9..2d82cf56834f 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -931,16 +931,18 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, } Stmt visit(const Free *op) override { - if (replacements.count(op->name)) { - return Free::make(replacements[op->name]); + auto it = replacements.find(op->name); + if (it != replacements.end()) { + return Free::make(it->second); } else { return IRMutator::visit(op); } } Expr visit(const Load *op) override { - if (replacements.count(op->name)) { - return Load::make(op->type, replacements[op->name], + auto it = replacements.find(op->name); + if (it != replacements.end()) { + return Load::make(op->type, it->second, std::move(mutate(op->index)), op->image, op->param, std::move(mutate(op->predicate)), op->alignment); } else { @@ -949,8 +951,9 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, } Stmt visit(const Store *op) override { - if (replacements.count(op->name)) { - return Store::make(replacements[op->name], std::move(mutate(op->value)), + auto it = replacements.find(op->name); + if (it != replacements.end()) { + return Store::make(it->second, std::move(mutate(op->value)), std::move(mutate(op->index)), op->param, std::move(mutate(op->predicate)), op->alignment); } else { From 57334f298f9e11d1608757436b82c11c30f4d973 Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Thu, 4 Jun 2020 18:05:22 -0400 Subject: [PATCH 13/29] Trailing whitespace --- apps/camera_pipe/camera_pipe_generator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/camera_pipe/camera_pipe_generator.cpp b/apps/camera_pipe/camera_pipe_generator.cpp index 98cf0c2c8b13..0eba09f12f3c 100644 --- a/apps/camera_pipe/camera_pipe_generator.cpp +++ b/apps/camera_pipe/camera_pipe_generator.cpp @@ -460,7 +460,7 @@ void CameraPipe::generate() { Var xi, yi, xii, xio; /* These tile factors obtain 1391us on a gtx 980. */ - int tile_x = 28; + int tile_x = 28; int tile_y = 12; if (get_target().has_feature(Target::D3D12Compute)) { From 8d5d3307387180a8015249b8c5b30333013015e1 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 4 Jun 2020 16:30:51 -0700 Subject: [PATCH 14/29] Don't move an rval --- src/CodeGen_D3D12Compute_Dev.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index 2d82cf56834f..34cde476d60e 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -943,8 +943,8 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, auto it = replacements.find(op->name); if (it != replacements.end()) { return Load::make(op->type, it->second, - std::move(mutate(op->index)), op->image, op->param, - std::move(mutate(op->predicate)), op->alignment); + mutate(op->index), op->image, op->param, + mutate(op->predicate), op->alignment); } else { return IRMutator::visit(op); } @@ -953,9 +953,9 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, Stmt visit(const Store *op) override { auto it = replacements.find(op->name); if (it != replacements.end()) { - return Store::make(it->second, std::move(mutate(op->value)), - std::move(mutate(op->index)), op->param, - std::move(mutate(op->predicate)), op->alignment); + return Store::make(it->second, mutate(op->value), + mutate(op->index), op->param, + mutate(op->predicate), op->alignment); } else { return IRMutator::visit(op); } From 8eb2889b8828d277dc7ba4399d9459081c9a2014 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 4 Jun 2020 18:21:30 -0700 Subject: [PATCH 15/29] Touch From 4e96784728362370ad68577923a85ada7991bfde Mon Sep 17 00:00:00 2001 From: Shoaib Kamil Date: Fri, 5 Jun 2020 11:27:16 -0400 Subject: [PATCH 16/29] Fix all-positive-strides check for D3D to match OpenCL/CUDA. Also drive-by change to Metal since it has the same logic. --- src/runtime/d3d12compute.cpp | 2 +- src/runtime/metal.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index f4a171893a1e..ff3679951603 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -2289,7 +2289,7 @@ WEAK int halide_d3d12compute_device_malloc(void *user_context, halide_buffer_t * // Check all strides positive for (int i = 0; i < buf->dimensions; i++) { - halide_assert(user_context, buf->dim[i].stride > 0); + halide_assert(user_context, buf->dim[i].stride >= 0); } TRACEPRINT("allocating " << *buf << "\n"); diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp index c10c390fdd3e..758e0abb70ca 100644 --- a/src/runtime/metal.cpp +++ b/src/runtime/metal.cpp @@ -448,7 +448,7 @@ WEAK int halide_metal_device_malloc(void *user_context, halide_buffer_t *buf) { // Check all strides positive for (int i = 0; i < buf->dimensions; i++) { - halide_assert(user_context, buf->dim[i].stride > 0); + halide_assert(user_context, buf->dim[i].stride >= 0); } debug(user_context) << " allocating " << *buf << "\n"; From 52bf96d3367fcc152368aa6c3873487c72931468 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sun, 7 Jun 2020 14:38:34 -0700 Subject: [PATCH 17/29] Large stack printer objects seem to cause problems with LLVM when debug info is on. Use alternative means. --- src/runtime/d3d12compute.cpp | 61 +++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index 220d7282506c..c1c4e26c0892 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -86,40 +86,37 @@ class StackPrinter : public Printer { static void *const user_context = NULL; // #if HALIDE_D3D12_TRACE -// it's better to use StackPrinter<> instead of Printer<> (debug(ctx)) when -// tracing calls because Printer<> calls 'halide_malloc()' which can fail; -// there's even a test for it (generator_aot_cleanup_on_error), simulating -// a halide_malloc failure, and user code should not affect debug-tracing -#define trace(x) StackPrinter<4096, BasicPrinter>(x) - -static volatile ScopedSpinLock::AtomicFlag indent_lock = 0; -static const char indent_pattern[] = " "; -static char indent[2048] = {}; -static int indent_end = 0; -#define TRACEINDENT ((const char *)indent) -#define TRACEPRINT(msg) trace(user_context) << TRACEINDENT << msg; +static volatile ScopedSpinLock::AtomicFlag trace_indent_lock = 0; +static const char trace_indent_pattern[] = " "; +static char trace_indent[4096] = {}; +static char trace_buf[4096] = {}; +static int trace_indent_end = 0; +#define TRACEINDENT ((const char *)trace_indent) +#define TRACEPRINT(msg) Printer(user_context, trace_buf) << TRACEINDENT << msg; struct TraceLogScope { - TraceLogScope() { - while (__atomic_test_and_set(&indent_lock, __ATOMIC_ACQUIRE)) { + void *user_context; + TraceLogScope(void *user_context, const char *function) : + user_context(user_context) { + Printer(user_context, trace_buf) << TRACEINDENT << "[@] " << function << "\n"; + while (__atomic_test_and_set(&trace_indent_lock, __ATOMIC_ACQUIRE)) { } - for (const char *p = indent_pattern; *p; ++p) { - indent[indent_end++] = *p; + for (const char *p = trace_indent_pattern; *p; ++p) { + trace_indent[trace_indent_end++] = *p; } - __atomic_clear(&indent_lock, __ATOMIC_RELEASE); + __atomic_clear(&trace_indent_lock, __ATOMIC_RELEASE); } ~TraceLogScope() { - while (__atomic_test_and_set(&indent_lock, __ATOMIC_ACQUIRE)) { + while (__atomic_test_and_set(&trace_indent_lock, __ATOMIC_ACQUIRE)) { } - for (const char *p = indent_pattern; *p; ++p) { - indent[--indent_end] = '\0'; + for (const char *p = trace_indent_pattern; *p; ++p) { + trace_indent[--trace_indent_end] = '\0'; } - TRACEPRINT("^^^\n"); - __atomic_clear(&indent_lock, __ATOMIC_RELEASE); + Printer(user_context, trace_buf) << TRACEINDENT << "^^^\n"; + __atomic_clear(&trace_indent_lock, __ATOMIC_RELEASE); } }; -#define TRACELOG \ - TRACEPRINT("[@]" << __FUNCTION__ << "\n"); \ - TraceLogScope trace_scope___; +#define TRACELOG TraceLogScope trace_scope___(user_context, __FUNCTION__); + #else #define TRACEINDENT "" #define TRACEPRINT(msg) @@ -240,6 +237,7 @@ D3D12TYPENAME(IDXGIOutput) template static bool D3DError(HRESULT result, ID3D12T *object, void *user_context, const char *message) { + TRACELOG; // HRESULT ERROR CODES: // D3D12: https://msdn.microsoft.com/en-us/library/windows/desktop/bb509553(v=vs.85).aspx // Win32: https://msdn.microsoft.com/en-us/library/windows/desktop/aa378137(v=vs.85).aspx @@ -1065,6 +1063,8 @@ WEAK void dispatch_threadgroups(d3d12_compute_command_list *cmdList, } WEAK d3d12_buffer new_buffer_resource(d3d12_device *device, size_t length, D3D12_HEAP_TYPE heaptype) { + TRACELOG; + D3D12_RESOURCE_DESC desc = {}; { desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; @@ -1627,10 +1627,11 @@ static void dump_shader(const char *source, ID3DBlob *compiler_msgs = NULL) { message = (const char *)compiler_msgs->GetBufferPointer(); } - StackPrinter<64 * 1024> dump; - dump(user_context) << TRACEINDENT << "D3DCompile(): " << message << "\n"; - dump(user_context) << TRACEINDENT << ">>> HLSL shader source dump <<<\n" - << source << "\n"; + char *buf = (char *)malloc(64 * 1024); + print(user_context, buf) << TRACEINDENT << "D3DCompile(): " << message << "\n" + << TRACEINDENT << ">>> HLSL shader source dump <<<\n" + << source << "\n"; + free(buf); } static d3d12_function *new_function_with_name(d3d12_device *device, d3d12_library *library, const char *name, size_t name_len, @@ -2495,6 +2496,8 @@ namespace { static int do_multidimensional_copy(d3d12_device *device, const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int dimensions) { + TRACELOG; + if (dimensions == 0) { d3d12_buffer *dsrc = reinterpret_cast(c.src); d3d12_buffer *ddst = reinterpret_cast(c.dst); From a0359cb29ee700e0bb8ef9cdd2f3176375b2824c Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sun, 7 Jun 2020 14:44:18 -0700 Subject: [PATCH 18/29] Make thread-safe --- src/runtime/d3d12compute.cpp | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index c1c4e26c0892..02cb1489fd40 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -87,32 +87,46 @@ static void *const user_context = NULL; // #if HALIDE_D3D12_TRACE static volatile ScopedSpinLock::AtomicFlag trace_indent_lock = 0; +#define TRACE_BUF_SIZE 4096 static const char trace_indent_pattern[] = " "; -static char trace_indent[4096] = {}; -static char trace_buf[4096] = {}; +static char trace_indent[TRACE_BUF_SIZE] = {}; +static char trace_buf[TRACE_BUF_SIZE] = {}; static int trace_indent_end = 0; #define TRACEINDENT ((const char *)trace_indent) -#define TRACEPRINT(msg) Printer(user_context, trace_buf) << TRACEINDENT << msg; +#define TRACEPRINT(msg) { \ + trace_scope___.lock(); \ + Printer(user_context, trace_buf) << TRACEINDENT << msg; \ + trace_scope___.unlock(); \ +} struct TraceLogScope { void *user_context; - TraceLogScope(void *user_context, const char *function) : - user_context(user_context) { - Printer(user_context, trace_buf) << TRACEINDENT << "[@] " << function << "\n"; + + void lock() { while (__atomic_test_and_set(&trace_indent_lock, __ATOMIC_ACQUIRE)) { } + } + + void unlock() { + __atomic_clear(&trace_indent_lock, __ATOMIC_RELEASE); + } + + TraceLogScope(void *user_context, const char *function) : + user_context(user_context) { + lock(); + Printer(user_context, trace_buf) << TRACEINDENT << "[@] " << function << "\n"; for (const char *p = trace_indent_pattern; *p; ++p) { trace_indent[trace_indent_end++] = *p; } - __atomic_clear(&trace_indent_lock, __ATOMIC_RELEASE); + unlock(); } + ~TraceLogScope() { - while (__atomic_test_and_set(&trace_indent_lock, __ATOMIC_ACQUIRE)) { - } + lock(); for (const char *p = trace_indent_pattern; *p; ++p) { trace_indent[--trace_indent_end] = '\0'; } - Printer(user_context, trace_buf) << TRACEINDENT << "^^^\n"; - __atomic_clear(&trace_indent_lock, __ATOMIC_RELEASE); + Printer(user_context, trace_buf) << TRACEINDENT << "^^^\n"; + unlock(); } }; #define TRACELOG TraceLogScope trace_scope___(user_context, __FUNCTION__); From c275f3aae6c0fbb9996ba624faa67f6c5020d6d5 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sun, 7 Jun 2020 14:50:20 -0700 Subject: [PATCH 19/29] clang-format --- src/runtime/d3d12compute.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index 02cb1489fd40..4688b912b693 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -93,11 +93,12 @@ static char trace_indent[TRACE_BUF_SIZE] = {}; static char trace_buf[TRACE_BUF_SIZE] = {}; static int trace_indent_end = 0; #define TRACEINDENT ((const char *)trace_indent) -#define TRACEPRINT(msg) { \ - trace_scope___.lock(); \ +#define TRACEPRINT(msg) \ + { \ + trace_scope___.lock(); \ Printer(user_context, trace_buf) << TRACEINDENT << msg; \ - trace_scope___.unlock(); \ -} + trace_scope___.unlock(); \ + } struct TraceLogScope { void *user_context; @@ -110,8 +111,8 @@ struct TraceLogScope { __atomic_clear(&trace_indent_lock, __ATOMIC_RELEASE); } - TraceLogScope(void *user_context, const char *function) : - user_context(user_context) { + TraceLogScope(void *user_context, const char *function) + : user_context(user_context) { lock(); Printer(user_context, trace_buf) << TRACEINDENT << "[@] " << function << "\n"; for (const char *p = trace_indent_pattern; *p; ++p) { @@ -129,7 +130,7 @@ struct TraceLogScope { unlock(); } }; -#define TRACELOG TraceLogScope trace_scope___(user_context, __FUNCTION__); +#define TRACELOG TraceLogScope trace_scope___(user_context, __FUNCTION__); #else #define TRACEINDENT "" @@ -1078,7 +1079,7 @@ WEAK void dispatch_threadgroups(d3d12_compute_command_list *cmdList, WEAK d3d12_buffer new_buffer_resource(d3d12_device *device, size_t length, D3D12_HEAP_TYPE heaptype) { TRACELOG; - + D3D12_RESOURCE_DESC desc = {}; { desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; @@ -2511,7 +2512,7 @@ namespace { static int do_multidimensional_copy(d3d12_device *device, const device_copy &c, uint64_t src_offset, uint64_t dst_offset, int dimensions) { TRACELOG; - + if (dimensions == 0) { d3d12_buffer *dsrc = reinterpret_cast(c.src); d3d12_buffer *ddst = reinterpret_cast(c.dst); From 2e5309aed8cfc8fe3a9d182f561e806844b528de Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 8 Jun 2020 11:36:46 -0700 Subject: [PATCH 20/29] Delete StackPrinter and improve Printer instead --- src/runtime/d3d12compute.cpp | 132 ++++++++++++++++------------------- src/runtime/printer.h | 13 +++- 2 files changed, 70 insertions(+), 75 deletions(-) diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index 4688b912b693..c119a0eba243 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -85,68 +85,57 @@ class StackPrinter : public Printer { // in case there's no 'user_context' available in the scope of a function: static void *const user_context = NULL; // -#if HALIDE_D3D12_TRACE -static volatile ScopedSpinLock::AtomicFlag trace_indent_lock = 0; -#define TRACE_BUF_SIZE 4096 -static const char trace_indent_pattern[] = " "; -static char trace_indent[TRACE_BUF_SIZE] = {}; -static char trace_buf[TRACE_BUF_SIZE] = {}; -static int trace_indent_end = 0; -#define TRACEINDENT ((const char *)trace_indent) -#define TRACEPRINT(msg) \ - { \ - trace_scope___.lock(); \ - Printer(user_context, trace_buf) << TRACEINDENT << msg; \ - trace_scope___.unlock(); \ - } -struct TraceLogScope { - void *user_context; - void lock() { - while (__atomic_test_and_set(&trace_indent_lock, __ATOMIC_ACQUIRE)) { +// Trace and logging utilities for debugging. +#if HALIDE_D3D12_TRACE +static volatile ScopedSpinLock::AtomicFlag trace_lock = 0; +static char trace_buf[4096] = {}; +static int trace_indent = 0; + +struct trace : public Printer { + ScopedSpinLock lock; + trace() + : Printer(NULL, trace_buf), + lock(&trace_lock) { + for (int i = 0; i < trace_indent; i++) { + *this << " "; } } - - void unlock() { - __atomic_clear(&trace_indent_lock, __ATOMIC_RELEASE); - } - - TraceLogScope(void *user_context, const char *function) - : user_context(user_context) { - lock(); - Printer(user_context, trace_buf) << TRACEINDENT << "[@] " << function << "\n"; - for (const char *p = trace_indent_pattern; *p; ++p) { - trace_indent[trace_indent_end++] = *p; - } - unlock(); +}; +struct TraceScope { + TraceScope() { + ScopedSpinLock lock(&trace_lock); + trace_indent++; } - - ~TraceLogScope() { - lock(); - for (const char *p = trace_indent_pattern; *p; ++p) { - trace_indent[--trace_indent_end] = '\0'; - } - Printer(user_context, trace_buf) << TRACEINDENT << "^^^\n"; - unlock(); + ~TraceScope() { + ScopedSpinLock lock(&trace_lock); + trace_indent--; } }; -#define TRACELOG TraceLogScope trace_scope___(user_context, __FUNCTION__); + +#define TRACELOG \ + trace() << "[@] " << __FUNCTION__ << "\n"; \ + TraceScope trace_scope__; + +#define TRACEPRINT(msg) trace() << msg; #else -#define TRACEINDENT "" -#define TRACEPRINT(msg) +typedef SinkPrinter trace; #define TRACELOG +#define TRACEPRINT(msg) #endif // -#define ERRORLOG error(user_context) << TRACEINDENT // ^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^ -static const char *d3d12_debug_dump(); +static void d3d12_debug_dump(error &err); #define d3d12_halt(...) \ - ERRORLOG << __VA_ARGS__ << "\n" \ - << d3d12_debug_dump() \ - << "!!! HALT !!!\n" + do { \ + error err(NULL); \ + err << __VA_ARGS__ << "\n"; \ + d3d12_debug_dump(err); \ + err << "!!! HALT !!!\n"; \ + } while (0) void *d3d12_load_library(const char *name) { TRACELOG; @@ -178,7 +167,7 @@ void *d3d12_get_library_symbol(void *lib, const char *name) { #if HALIDE_D3D12_RENDERDOC #if HALIDE_D3D12_DEBUG_LAYER -#pragma message "RenderDoc might not work well alongside Dirct3D debug layers..." +#pragma message "RenderDoc might not work well alongside Direct3D debug layers..." #endif #define WIN32 #define RenderDocAssert(expr) halide_assert(user_context, expr) @@ -266,7 +255,7 @@ static bool D3DError(HRESULT result, ID3D12T *object, void *user_context, const return false; } -static DXGI_FORMAT FindD3D12FormatForHalideType(halide_type_t type) { +static DXGI_FORMAT FindD3D12FormatForHalideType(void *user_context, halide_type_t type) { // DXGI Formats: // https://msdn.microsoft.com/en-us/library/windows/desktop/bb173059(v=vs.85).aspx @@ -588,7 +577,7 @@ struct d3d12_profiler { UINT max_queries; }; -static size_t number_of_elements(const halide_buffer_t *buffer) { +static size_t number_of_elements(void *user_context, const halide_buffer_t *buffer) { // halide_buffer_t::number_of_elements() does not necessarily map to D3D12 // Buffer View 'NumElements' since the former does not account for "hidden" // elements in the stride regions. @@ -737,7 +726,7 @@ static const d3d12_buffer *peel_buffer(const struct halide_buffer_t *hbuffer) { return peel_buffer(const_cast(hbuffer)); } -WEAK int wrap_buffer(struct halide_buffer_t *hbuffer, d3d12_buffer *dbuffer) { +WEAK int wrap_buffer(void *user_context, struct halide_buffer_t *hbuffer, d3d12_buffer *dbuffer) { halide_assert(user_context, (hbuffer->device == 0)); if (hbuffer->device != 0) { return halide_error_code_device_wrap_native_failed; @@ -748,8 +737,8 @@ WEAK int wrap_buffer(struct halide_buffer_t *hbuffer, d3d12_buffer *dbuffer) { dbuffer->offset = 0; dbuffer->offsetInBytes = 0; dbuffer->sizeInBytes = hbuffer->size_in_bytes(); - dbuffer->elements = number_of_elements(hbuffer); - dbuffer->format = FindD3D12FormatForHalideType(hbuffer->type); + dbuffer->elements = number_of_elements(user_context, hbuffer); + dbuffer->format = FindD3D12FormatForHalideType(user_context, hbuffer->type); if (dbuffer->format == DXGI_FORMAT_UNKNOWN) { d3d12_halt("unsupported buffer element type: " << hbuffer->type); return halide_error_code_device_wrap_native_failed; @@ -1642,11 +1631,10 @@ static void dump_shader(const char *source, ID3DBlob *compiler_msgs = NULL) { message = (const char *)compiler_msgs->GetBufferPointer(); } - char *buf = (char *)malloc(64 * 1024); - print(user_context, buf) << TRACEINDENT << "D3DCompile(): " << message << "\n" - << TRACEINDENT << ">>> HLSL shader source dump <<<\n" - << source << "\n"; - free(buf); + Printer(user_context) + << "D3DCompile(): " << message << "\n" + << ">>> HLSL shader source dump <<<\n" + << source << "\n"; } static d3d12_function *new_function_with_name(d3d12_device *device, d3d12_library *library, const char *name, size_t name_len, @@ -1660,7 +1648,7 @@ static d3d12_function *new_function_with_name(d3d12_device *device, d3d12_librar // consult the compiled function cache in the library first: d3d12_function *function = NULL; - StackPrinter<256, StringStreamPrinter> key; + Printer key(NULL); key << name << "_(" << threadsX << "," << threadsY << "," << threadsZ << ")_[" << shared_mem_bytes << "]"; halide_assert(user_context, (key.size() < key.capacity() - 1)); // make sure key fits into the stream int not_found = library->cache.lookup(user_context, (const uint8_t *)key.str(), key.size(), &function); @@ -1675,7 +1663,7 @@ static d3d12_function *new_function_with_name(d3d12_device *device, d3d12_librar const char *source = library->source; int source_size = library->source_length; - StackPrinter<16, StringStreamPrinter> SS[4]; + Printer SS[4] = {NULL, NULL, NULL, NULL}; D3D_SHADER_MACRO pDefines[] = { {"__GROUPSHARED_SIZE_IN_BYTES", (SS[0] << shared_mem_bytes).str()}, {"__NUM_TREADS_X", (SS[1] << threadsX).str()}, @@ -2251,28 +2239,30 @@ WEAK int halide_d3d12compute_release_context(void *user_context) { } // extern "C" -static const char *d3d12_debug_dump() { +static void d3d12_debug_dump(error &err) { TRACELOG; if (!device) { - return "debug info not available: no device."; + err << "debug info not available: no device.\n"; + return; } halide_assert(user_context, (dxgiAdapter != NULL)); DXGI_ADAPTER_DESC1 desc = {}; if (FAILED(dxgiAdapter->GetDesc1(&desc))) { - return "Unable to retrieve information about the adapter.\n"; + err << "Unable to retrieve information about the adapter.\n"; + return; } // NOTE(marcos): this printer will leak, but that's fine since debug dump // is a panic mechanism that precedes an operational "halt": void *dump_buffer = d3d12_malloc(64 * 1024); if (!dump_buffer) { - return "Unable to allocate memory for the debug dump.\n"; + err << "Unable to allocate memory for the debug dump.\n"; + return; } - Printer dump(user_context, (char *)dump_buffer); - dump << "\n===== Debug Dump =====\n"; + err << "\n===== Debug Dump =====\n"; // simple conversion from Windows 16bit wchar to char: char Description[128]; @@ -2281,9 +2271,7 @@ static const char *d3d12_debug_dump() { } Description[127] = '\0'; - dump << "D3D12 Adapter: " << Description << "\n"; - - return dump.str(); + err << "D3D12 Adapter: " << Description << "\n"; } using namespace Halide::Runtime::Internal::D3D12Compute; @@ -2324,7 +2312,7 @@ WEAK int halide_d3d12compute_device_malloc(void *user_context, halide_buffer_t * return halide_error_code_device_malloc_failed; } - if (0 != wrap_buffer(buf, d3d12_buf)) { + if (0 != wrap_buffer(user_context, buf, d3d12_buf)) { d3d12_halt("D3D12: unable to wrap halide buffer and D3D12 buffer."); return halide_error_code_device_wrap_native_failed; } @@ -2835,7 +2823,7 @@ WEAK int halide_d3d12compute_run(void *user_context, #if HALIDE_D3D12_PROFILING uint64_t eps = (uint64_t)get_elapsed_time(profiler, ini, end); - StackPrinter<64>() << "kernel execution time: " << eps << "us.\n"; + Printer() << "kernel execution time: " << eps << "us.\n"; // TODO: keep some live performance stats in the d3d12_function object // (accumulate stats based on dispatch similarities -- e.g., blocksX|Y|Z) release_object(profiler); @@ -3117,7 +3105,7 @@ WEAK int halide_d3d12compute_wrap_buffer(void *user_context, struct halide_buffe sbuffer.type = d3d12_buffer::ReadWrite; sbuffer.state = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; - int ret = wrap_buffer(halide_buf, &sbuffer); + int ret = wrap_buffer(user_context, halide_buf, &sbuffer); if (ret != 0) { return ret; } diff --git a/src/runtime/printer.h b/src/runtime/printer.h index 045cce88302e..e174f77a2f71 100644 --- a/src/runtime/printer.h +++ b/src/runtime/printer.h @@ -32,10 +32,13 @@ class Printer { char *buf, *dst, *end; void *user_context; bool own_mem; + char scratch[length <= 256 ? length : 1]; Printer(void *ctx, char *mem = NULL) : user_context(ctx), own_mem(mem == NULL) { - buf = mem ? mem : (char *)halide_malloc(user_context, length); + buf = (mem ? mem : + length <= sizeof(scratch) ? scratch : + (char *)malloc(length)); dst = buf; if (dst) { end = buf + (length - 1); @@ -132,6 +135,10 @@ class Printer { return (uint64_t)(dst - buf); } + uint64_t capacity() const { + return length; + } + // Delete the last N characters void erase(int n) { if (dst) { @@ -165,8 +172,8 @@ class Printer { } } - if (own_mem) { - halide_free(user_context, buf); + if (own_mem && buf != scratch) { + free(buf); } } }; From 01b1a14f5213213f083d03d283f9eac43d88e2a3 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 8 Jun 2020 11:45:58 -0700 Subject: [PATCH 21/29] Fix #5012 --- test/error/atomics_gpu_mutex.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/test/error/atomics_gpu_mutex.cpp b/test/error/atomics_gpu_mutex.cpp index 1577fe639ae8..bdb9ad2fe864 100644 --- a/test/error/atomics_gpu_mutex.cpp +++ b/test/error/atomics_gpu_mutex.cpp @@ -4,21 +4,18 @@ using namespace Halide; int main(int argc, char **argv) { int img_size = 10000; - int hist_size = 7; - Func im, hist; + Func f; Var x; RDom r(0, img_size); - im(x) = (x * x) % hist_size; + f(x) = Tuple(1, 0); + f(r) = Tuple(f(r)[1] + 1, f(r)[0] + 1); - hist(x) = Tuple(0, 0); - hist(im(r)) += Tuple(1, 2); - - hist.compute_root(); + f.compute_root(); RVar ro, ri; - hist.update() + f.update() .atomic() .split(r, ro, ri, 8) .gpu_blocks(ro) @@ -28,7 +25,7 @@ int main(int argc, char **argv) { // and we don't allow GPU blocks on mutex locks since // it leads to deadlocks. // This should throw an error - Realization out = hist.realize(hist_size); + Realization out = f.realize(img_size); printf("Success!\n"); return 0; From 0d6fe34484ab36f90174dfe964fe727f72684e36 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 8 Jun 2020 11:50:24 -0700 Subject: [PATCH 22/29] Use if-else instead of nested ternary --- src/runtime/printer.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/runtime/printer.h b/src/runtime/printer.h index e174f77a2f71..1dd9301157d5 100644 --- a/src/runtime/printer.h +++ b/src/runtime/printer.h @@ -36,9 +36,14 @@ class Printer { Printer(void *ctx, char *mem = NULL) : user_context(ctx), own_mem(mem == NULL) { - buf = (mem ? mem : - length <= sizeof(scratch) ? scratch : - (char *)malloc(length)); + if (mem != NULL) { + buf = mem; + } else if (length <= sizeof(scratch)) { + buf = scratch; + } else { + buf = (char *)malloc(length); + } + dst = buf; if (dst) { end = buf + (length - 1); From 65a1bc29bc04d02fbf9c27df9c76707d46e6fc49 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 8 Jun 2020 12:18:02 -0700 Subject: [PATCH 23/29] Fix args to nvvm.shfl.up --- src/LowerWarpShuffles.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/LowerWarpShuffles.cpp b/src/LowerWarpShuffles.cpp index 344641f170fc..d42c47294f1b 100644 --- a/src/LowerWarpShuffles.cpp +++ b/src/LowerWarpShuffles.cpp @@ -592,9 +592,9 @@ class LowerWarpShuffles : public IRMutator { // this. Expr mask = (1 << bits) - 1; Expr down = Call::make(shuffle_type, "llvm.nvvm.shfl.down" + intrin_suffix, - {base_val, result[0], (1 << bits) - 1}, Call::PureExtern); + {base_val, result[0], mask}, Call::PureExtern); Expr up = Call::make(shuffle_type, "llvm.nvvm.shfl.up" + intrin_suffix, - {base_val, (1 << bits) - result[0], 0, mask}, Call::PureExtern); + {base_val, (1 << bits) - result[0], 0}, Call::PureExtern); Expr cond = (this_lane >= (1 << bits) - result[0]); Expr equiv = select(cond, up, down); shuffled = simplify(equiv, true, bounds); From 1bfb69869b18b1049c8d0bd86cc08c464de223d0 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 8 Jun 2020 14:39:42 -0700 Subject: [PATCH 24/29] Also fix OpenCL --- src/runtime/opencl.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index 5f7aece0cc93..488d47554418 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -696,20 +696,21 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr, err = clBuildProgram(program, 1, devices, options.str(), NULL, NULL); if (err != CL_SUCCESS) { - // Allocate an appropriately sized buffer for the build log. - char buffer[8192]; - - // Get build log - if (clGetProgramBuildInfo(program, dev, - CL_PROGRAM_BUILD_LOG, - sizeof(buffer), buffer, - NULL) == CL_SUCCESS) { - error(user_context) << "CL: clBuildProgram failed: " - << get_opencl_error_name(err) - << "\nBuild Log:\n" - << buffer << "\n"; - } else { - error(user_context) << "clGetProgramBuildInfo failed"; + { + // Allocate an appropriately sized buffer for the build log. + Printer p(user_context); + + p << "CL: clBuildProgram failed: " + << get_opencl_error_name(err) + << "\nBuild Log:\n"; + + // Get build log + if (clGetProgramBuildInfo(program, dev, + CL_PROGRAM_BUILD_LOG, + p.capacity() - p.size() - 1, p.dst, + NULL) != CL_SUCCESS) { + p << "clGetProgramBuildInfo failed"; + } } return err; @@ -720,7 +721,6 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr, uint64_t t_after = halide_current_time_ns(user_context); debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n"; #endif - return 0; } From 10166ad20b473da2e6e56e797ae996c165d6500b Mon Sep 17 00:00:00 2001 From: Alexei Colin Date: Mon, 8 Jun 2020 17:25:51 +0000 Subject: [PATCH 25/29] cmake: llvm: fix linking against LLVM shared lib Fixes the build with -DLLVM_USE_SHARED_LLVM_LIBRARY=ON to actually use the dynamic build (libLLVM.so). Relevant for LLVM builds with -DLLVM_BUILD_LLVM_DYLIB=ON. Tested with LLVM v10.0.0. There are two issues that prevent a build against LLVM shared lib: 1. cmake failure: CMake Error at /usr/lib/llvm/10/lib64/cmake/llvm/LLVM-Config.cmake:145 (message): Target NVPTX is not in the set of libraries. Call Stack (most recent call first): /usr/lib/llvm/10/lib64/cmake/llvm/LLVM-Config.cmake:270 (llvm_expand_pseudo_components) dependencies/llvm/CMakeLists.txt:141 (llvm_map_components_to_libnames) 2. missing -lLLVM on the link line Regarding issue 2: the result of missing -lLLVM on the link line can be either of the following scenarios both of which are wrong: (A) (in theory) a successful build but with all LLVM modules linked into libHalide.so, which is not what the user requested with LLVM_USE_SHARED_LLVM_LIBRARY=ON, (B) (observed) a broken libHalide.so, where linking of apps against it fails with undefined symbols errors related to target components (related to issue 1, see more below): /usr/lib/gcc/x86_64-pc-linux-gnu/10.1.0/../../../../x86_64-pc-linux-gnu/bin/ld: src/libHalide.so: undefined reference to `LLVMInitializeX86Target' Sidenote regarding (B), linking the app against -lLLVM makes the link succeed but fails at runtime, because then some static LLVM data is linked into the app binary twice, so components get loaded twice: : CommandLine Error: Option 'pm-max-devirt-iterations' registered more than once! LLVM ERROR: inconsistency in registered CommandLine options The above errors from (B) are symptomps that are not important, the fix is to just link libHalide.so against libLLVM.so. This commit provides two alternatives for achiving it: (a) use llvm_config() (b) woraround by manually doing what llvm_config() does I think the correct way is (a), however it cannot be used yet, because in lateset LLVM 10.0.0, llvm_config does not accept targets of type INTERFACE_LIBRARY (see attached patch for what it would take to add that support to LLVM). If/when LLVM starts supporting INTERFACE_LIBRARY, then (b) should be deleted in favor of (a). The problem with existing cmake code that tries to link against LLVM library(ies) was that it seemed to communicate the intent to LLVM-Config.cmake via LLVM_USE_SHARED var, but LLVM-Config doesn't use that var. Instead, the llvm_config method takes an argument option USE_SHARED. Regarding issue 1: appears to be caused by treating targets as components. Observed when LLVM is built into a shared library with: -DBUILD_SHARED_LIBS=OFF -DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON -DLLVM_DISTRIBUTION_COMPONENTS="comp1;comp2;..." -DLLVM_TARGETS_TO_BUILD="" -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD="X86;NVPTX;...." Note that LLVM_DYLIB_COMPONENTS is NOT set explicitly, and I am not sure what exactly it defaults to, but the shared lib (libLLVM.so does end up containing all components selected by LLVM_DISTRIBUTION_COMPONENTS and all selected targets. (Not that it matters, but this is the LLVM config used in the package recipe in the Gentoo distribution.) The selected LLVM components do get built as static archives (e.g. LLVMSupport.a, etc.) and do get linked into libLLVM.so and do get installed into the system, however, there the static archives for target libraries (e.g. LLVMX86TargetInfo.a, etc) get linked but do NOT get installed into the system. So, when Halide cmake script asks for the target libraries as components (in call to llvm_map_components_to_libnames), that call fails), presumably because the corresponding static archives for the target libraries do not exist in the system. However, when linking against the LLVM shared library (as opposed to static archives), it is not necessary to specify the targets as components on the link line. The target components are linked into the shared object, along with the rest of the components, and specifying just the shared object is sufficient. Note that llvm_config implementation specifies both the shared object, followed by static archives for sake of fallback, so I kept that same logic. It would work equally well to only list LLVM without the static objects for the case with LLVM_USE_SHARED_LLVM_LIBRARY=ON. For reference, to enable approach (a), the patch to add support for targets of type INTERFACE_LIBRARY in llvm_config() to LLVM v10.0.0: --- a/cmake/modules/LLVM-Config.cmake 2020-06-08 19:36:44.804248189 -0000 +++ b/cmake/modules/LLVM-Config.cmake 2020-06-08 19:37:23.191439807 -0000 @@ -87,7 +87,13 @@ endif() endif() - target_link_libraries(${executable} PRIVATE LLVM) + get_target_property(t ${executable} TYPE) + if(t STREQUAL "INTERFACE_LIBRARY") + set(link_mode "INTERFACE") + else() + set(link_mode "PRIVATE") + endif() + target_link_libraries(${executable} ${link_mode} LLVM) endif() explicit_llvm_config(${executable} ${link_components}) @@ -99,7 +105,7 @@ llvm_map_components_to_libnames(LIBRARIES ${link_components}) get_target_property(t ${executable} TYPE) - if(t STREQUAL "STATIC_LIBRARY") + if(t STREQUAL "STATIC_LIBRARY" OR t STREQUAL "INTERFACE_LIBRARY") target_link_libraries(${executable} INTERFACE ${LIBRARIES}) elseif(t STREQUAL "EXECUTABLE" OR t STREQUAL "SHARED_LIBRARY" OR t STREQUAL "MODULE_LIBRARY") target_link_libraries(${executable} PRIVATE ${LIBRARIES}) --- dependencies/llvm/CMakeLists.txt | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/dependencies/llvm/CMakeLists.txt b/dependencies/llvm/CMakeLists.txt index 05e73ccabb55..89a4723c939a 100644 --- a/dependencies/llvm/CMakeLists.txt +++ b/dependencies/llvm/CMakeLists.txt @@ -127,13 +127,23 @@ endif () ## # Finish setting up llvm library +# +# Ideally, we would use llvm_config (instead of hardcoding LLVM lib name below): +# if (LLVM_USE_SHARED_LLVM_LIBRARY) +# set(LLVM_USE_SHARED "USE_SHARED") +# endif() +# llvm_config(Halide_LLVM ${LLVM_USE_SHARED} ${LLVM_COMPONENTS}) +# However, llvm_config (LLVM 10.0) does not accept INTERFACE_LIBRARY targets, +# so the below code does what llvm_config() does, with the slight difference +# that we link exclusively to the shared library without fallback to static +# libraries for symbols not resolved by the shared library. ## if (LLVM_USE_SHARED_LLVM_LIBRARY) - set(LLVM_USE_SHARED "USE_SHARED") + set(LLVM_LIBNAMES LLVM) +else () + llvm_map_components_to_libnames(LLVM_LIBNAMES ${LLVM_COMPONENTS}) endif () - -llvm_map_components_to_libnames(LLVM_LIBNAMES ${LLVM_COMPONENTS}) target_link_libraries(Halide_LLVM INTERFACE ${LLVM_LIBNAMES}) ## From cbcbcfcbd5ad683f5ff73fdadc0d85daef599730 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 9 Jun 2020 17:15:32 -0700 Subject: [PATCH 26/29] Don't print so many parentheses in IRPrinter --- src/IRPrinter.cpp | 164 +++++++++++++++++++++++++++------------------- src/IRPrinter.h | 16 ++++- 2 files changed, 111 insertions(+), 69 deletions(-) diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index f32f87b91607..21bbf7d54a03 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -351,11 +351,17 @@ std::ostream &operator<<(std::ostream &out, const DimType &t) { } IRPrinter::IRPrinter(ostream &s) - : stream(s), indent(0) { + : stream(s) { s.setf(std::ios::fixed, std::ios::floatfield); } void IRPrinter::print(const Expr &ir) { + ScopedValue old(implicit_parens, false); + ir.accept(this); +} + +void IRPrinter::print_no_parens(const Expr &ir) { + ScopedValue old(implicit_parens, true); ir.accept(this); } @@ -365,7 +371,7 @@ void IRPrinter::print(const Stmt &ir) { void IRPrinter::print_list(const std::vector &exprs) { for (size_t i = 0; i < exprs.size(); i++) { - print(exprs[i]); + print_no_parens(exprs[i]); if (i < exprs.size() - 1) { stream << ", "; } @@ -442,129 +448,146 @@ void IRPrinter::visit(const Cast *op) { void IRPrinter::visit(const Variable *op) { if (!known_type.contains(op->name) && (op->type != Int(32))) { - stream << "(" << op->type << ")"; + // Handle types already have parens + if (op->type.is_handle()) { + stream << op->type; + } else { + stream << "(" << op->type << ")"; + } } stream << op->name; } +void IRPrinter::open() { + if (!implicit_parens) { + stream << "("; + } +} + +void IRPrinter::close() { + if (!implicit_parens) { + stream << ")"; + } +} + void IRPrinter::visit(const Add *op) { - stream << "("; + open(); print(op->a); stream << " + "; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const Sub *op) { - stream << "("; + open(); print(op->a); stream << " - "; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const Mul *op) { - stream << "("; + open(); print(op->a); stream << "*"; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const Div *op) { - stream << "("; + open(); print(op->a); stream << "/"; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const Mod *op) { - stream << "("; + open(); print(op->a); stream << " % "; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const Min *op) { stream << "min("; - print(op->a); + print_no_parens(op->a); stream << ", "; - print(op->b); - stream << ")"; + print_no_parens(op->b); + close(); } void IRPrinter::visit(const Max *op) { stream << "max("; - print(op->a); + print_no_parens(op->a); stream << ", "; - print(op->b); - stream << ")"; + print_no_parens(op->b); + close(); } void IRPrinter::visit(const EQ *op) { - stream << "("; + open(); print(op->a); stream << " == "; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const NE *op) { - stream << "("; + open(); print(op->a); stream << " != "; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const LT *op) { - stream << "("; + open(); print(op->a); stream << " < "; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const LE *op) { - stream << "("; + open(); print(op->a); stream << " <= "; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const GT *op) { - stream << "("; + open(); print(op->a); stream << " > "; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const GE *op) { - stream << "("; + open(); print(op->a); stream << " >= "; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const And *op) { - stream << "("; + open(); print(op->a); stream << " && "; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const Or *op) { - stream << "("; + open(); print(op->a); stream << " || "; print(op->b); - stream << ")"; + close(); } void IRPrinter::visit(const Not *op) { @@ -574,11 +597,11 @@ void IRPrinter::visit(const Not *op) { void IRPrinter::visit(const Select *op) { stream << "select("; - print(op->condition); + print_no_parens(op->condition); stream << ", "; - print(op->true_value); + print_no_parens(op->true_value); stream << ", "; - print(op->false_value); + print_no_parens(op->false_value); stream << ")"; } @@ -586,13 +609,13 @@ void IRPrinter::visit(const Load *op) { const bool has_pred = !is_one(op->predicate); const bool show_alignment = op->type.is_vector() && op->alignment.modulus > 1; if (has_pred) { - stream << "("; + open(); } if (!known_type.contains(op->name)) { stream << "(" << op->type << ")"; } stream << op->name << "["; - print(op->index); + print_no_parens(op->index); if (show_alignment) { stream << " aligned(" << op->alignment.modulus << ", " << op->alignment.remainder << ")"; } @@ -600,21 +623,21 @@ void IRPrinter::visit(const Load *op) { if (has_pred) { stream << " if "; print(op->predicate); - stream << ")"; + close(); } } void IRPrinter::visit(const Ramp *op) { stream << "ramp("; - print(op->base); + print_no_parens(op->base); stream << ", "; - print(op->stride); + print_no_parens(op->stride); stream << ", " << op->lanes << ")"; } void IRPrinter::visit(const Broadcast *op) { stream << "x" << op->lanes << "("; - print(op->value); + print_no_parens(op->value); stream << ")"; } @@ -622,7 +645,11 @@ void IRPrinter::visit(const Call *op) { // TODO: Print indication of C vs C++? if (!known_type.contains(op->name) && (op->type != Int(32))) { - stream << "(" << op->type << ")"; + if (op->type.is_handle()) { + stream << op->type; // Already has parens + } else { + stream << "(" << op->type << ")"; + } } stream << op->name << "("; print_list(op->args); @@ -631,17 +658,18 @@ void IRPrinter::visit(const Call *op) { void IRPrinter::visit(const Let *op) { ScopedBinding<> bind(known_type, op->name); - stream << "(let " << op->name << " = "; + open(); + stream << "let " << op->name << " = "; print(op->value); stream << " in "; print(op->body); - stream << ")"; + close(); } void IRPrinter::visit(const LetStmt *op) { ScopedBinding<> bind(known_type, op->name); stream << get_indent() << "let " << op->name << " = "; - print(op->value); + print_no_parens(op->value); stream << "\n"; print(op->body); @@ -649,9 +677,9 @@ void IRPrinter::visit(const LetStmt *op) { void IRPrinter::visit(const AssertStmt *op) { stream << get_indent() << "assert("; - print(op->condition); + print_no_parens(op->condition); stream << ", "; - print(op->message); + print_no_parens(op->message); stream << ")\n"; } @@ -671,9 +699,9 @@ void IRPrinter::visit(const ProducerConsumer *op) { void IRPrinter::visit(const For *op) { ScopedBinding<> bind(known_type, op->name); stream << get_indent() << op->for_type << op->device_api << " (" << op->name << ", "; - print(op->min); + print_no_parens(op->min); stream << ", "; - print(op->extent); + print_no_parens(op->extent); stream << ") {\n"; indent++; @@ -685,9 +713,9 @@ void IRPrinter::visit(const For *op) { void IRPrinter::visit(const Acquire *op) { stream << get_indent() << "acquire ("; - print(op->semaphore); + print_no_parens(op->semaphore); stream << ", "; - print(op->count); + print_no_parens(op->count); stream << ") {\n"; indent++; print(op->body); @@ -699,13 +727,13 @@ void IRPrinter::print_lets(const Let *let) { stream << get_indent(); ScopedBinding<> bind(known_type, let->name); stream << "let " << let->name << " = "; - print(let->value); + print_no_parens(let->value); stream << " in\n"; if (const Let *next = let->body.as()) { print_lets(next); } else { stream << get_indent(); - print(let->body); + print_no_parens(let->body); stream << "\n"; } } @@ -716,13 +744,13 @@ void IRPrinter::visit(const Store *op) { const bool show_alignment = op->value.type().is_vector() && (op->alignment.modulus > 1); if (has_pred) { stream << "predicate ("; - print(op->predicate); + print_no_parens(op->predicate); stream << ")\n"; indent++; stream << get_indent(); } stream << op->name << "["; - print(op->index); + print_no_parens(op->index); if (show_alignment) { stream << " aligned(" << op->alignment.modulus @@ -738,7 +766,7 @@ void IRPrinter::visit(const Store *op) { indent -= 2; } else { // Just print the value in-line - print(op->value); + print_no_parens(op->value); } stream << "\n"; if (has_pred) { @@ -779,7 +807,7 @@ void IRPrinter::visit(const Allocate *op) { if (op->new_expr.defined()) { stream << "\n"; stream << get_indent() << " custom_new { "; - print(op->new_expr); + print_no_parens(op->new_expr); stream << " }"; } if (!op->free_function.empty()) { @@ -800,9 +828,9 @@ void IRPrinter::visit(const Realize *op) { stream << get_indent() << "realize " << op->name << "("; for (size_t i = 0; i < op->bounds.size(); i++) { stream << "["; - print(op->bounds[i].min); + print_no_parens(op->bounds[i].min); stream << ", "; - print(op->bounds[i].extent); + print_no_parens(op->bounds[i].extent); stream << "]"; if (i < op->bounds.size() - 1) stream << ", "; } @@ -828,7 +856,7 @@ void IRPrinter::visit(const Prefetch *op) { const bool has_cond = !is_one(op->condition); if (has_cond) { stream << "if ("; - print(op->condition); + print_no_parens(op->condition); stream << ") {\n"; indent++; stream << get_indent(); @@ -836,9 +864,9 @@ void IRPrinter::visit(const Prefetch *op) { stream << "prefetch " << op->name << "("; for (size_t i = 0; i < op->bounds.size(); i++) { stream << "["; - print(op->bounds[i].min); + print_no_parens(op->bounds[i].min); stream << ", "; - print(op->bounds[i].extent); + print_no_parens(op->bounds[i].extent); stream << "]"; if (i < op->bounds.size() - 1) stream << ", "; } @@ -880,7 +908,7 @@ void IRPrinter::visit(const IfThenElse *op) { stream << get_indent(); while (1) { stream << "if ("; - print(op->condition); + print_no_parens(op->condition); stream << ") {\n"; indent++; print(op->then_case); @@ -907,7 +935,7 @@ void IRPrinter::visit(const IfThenElse *op) { void IRPrinter::visit(const Evaluate *op) { stream << get_indent(); - print(op->value); + print_no_parens(op->value); stream << "\n"; } @@ -936,7 +964,7 @@ void IRPrinter::visit(const Shuffle *op) { print_list(op->vectors); stream << ", "; for (size_t i = 0; i < op->indices.size(); i++) { - print(op->indices[i]); + print_no_parens(op->indices[i]); if (i < op->indices.size() - 1) { stream << ", "; } diff --git a/src/IRPrinter.h b/src/IRPrinter.h index f5ed3186dffe..b7c78890f4f3 100644 --- a/src/IRPrinter.h +++ b/src/IRPrinter.h @@ -101,6 +101,9 @@ class IRPrinter : public IRVisitor { /** emit an expression on the output stream */ void print(const Expr &); + /** Emit an expression on the output stream without enclosing parens */ + void print_no_parens(const Expr &); + /** emit a statement on the output stream */ void print(const Stmt &); @@ -120,7 +123,18 @@ class IRPrinter : public IRVisitor { /** The current indentation level, useful for pretty-printing * statements */ - int indent; + int indent = 0; + + /** Certain expressions do not need parens around them, e.g. the + * args to a call are already separated by commas and a + * surrounding set of parens. */ + bool implicit_parens = false; + + /** Either emits "(" or "", depending on the value of implicit_parens */ + void open(); + + /** Either emits ")" or "", depending on the value of implicit_parens */ + void close(); /** The symbols whose types can be inferred from values printed * already. */ From 3ca53319e9b1c3a83403a59dda1d78a9dc1bc4f4 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 9 Jun 2020 17:19:58 -0700 Subject: [PATCH 27/29] Fix test --- src/IRPrinter.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index 21bbf7d54a03..c2d7706e3d83 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -201,15 +201,15 @@ void IRPrinter::test() { std::string correct_source = "allocate buf[float32 * 1023] in Stack\n" "let y = 17\n" - "assert((y >= 3), halide_error_param_too_small_i64(\"y\", y, 3))\n" + "assert(y >= 3, halide_error_param_too_small_i64(\"y\", y, 3))\n" "produce buf {\n" - " parallel (x, -2, (y + 2)) {\n" - " buf[(y - 1)] = ((x*17)/(x - 3))\n" + " parallel (x, -2, y + 2) {\n" + " buf[y - 1] = (x*17)/(x - 3)\n" " }\n" "}\n" "consume buf {\n" " vectorized (x, 0, y) {\n" - " out[x] = (buf((x % 3)) + 1)\n" + " out[x] = buf(x % 3) + 1\n" " }\n" "}\n"; From a9ab983cb84ce13b48cd0614b1654af89c665a0f Mon Sep 17 00:00:00 2001 From: Z Stern Date: Tue, 9 Jun 2020 23:20:01 -0700 Subject: [PATCH 28/29] Small wording improvements. --- Makefile | 2 +- README_webassembly.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index c04983dfeb02..6cd9caef5d43 100644 --- a/Makefile +++ b/Makefile @@ -1247,7 +1247,7 @@ GENERATOR_AOTWASM_TESTS := $(filter-out generator_aotwasm_memory_profiler_mandel test_aotwasm_generator: $(GENERATOR_AOTWASM_TESTS) # This is just a test to ensure than RunGen builds and links for a critical mass of Generators; -# not all will work directly (e.g. due to missing define_externs at link time), so we blacklist +# not all will work directly (e.g. due to missing define_externs at link time), so we disable # those known to be broken for plausible reasons. GENERATOR_BUILD_RUNGEN_TESTS = $(GENERATOR_EXTERNAL_TEST_GENERATOR:$(ROOT_DIR)/test/generator/%_generator.cpp=$(FILTERS_DIR)/%.rungen) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/async_parallel.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) diff --git a/README_webassembly.md b/README_webassembly.md index 58a9e4759040..29d9cb20c5c9 100644 --- a/README_webassembly.md +++ b/README_webassembly.md @@ -87,7 +87,7 @@ Note that while some of these limitations may be improved in the future, some are effectively intrinsic to the nature of this problem. Realistically, this JIT implementation is intended solely for running Halide self-tests (and even then, a number of them are fundamentally impractical to support in a hosted-Wasm -environment and are blacklisted). +environment and are disabled). In sum: don't plan on using Halide JIT mode with Wasm unless you are working on the Halide library itself. @@ -155,7 +155,7 @@ will), you need to install Emscripten and a shell for running wasm+js code - To run the AOT tests, set `HL_TARGET=wasm-32-wasmrt` and build the `test_aotwasm_generator` target. (Note that the normal AOT tests won't run usefully with this target, as extra logic to run under a wasm-enabled shell is - required, and some tests are blacklisted.) + required, and some tests are disabled.) # Running benchmarks From 9cec5a5e8a4d1b26a0cb51d06586c95929d93d63 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 10 Jun 2020 10:20:36 -0700 Subject: [PATCH 29/29] New simplifier rules necessary for the gpu autoscheduler --- src/IRPrinter.cpp | 4 ++-- src/Simplify_Add.cpp | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index c2d7706e3d83..3b0b967e05f6 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -515,7 +515,7 @@ void IRPrinter::visit(const Min *op) { print_no_parens(op->a); stream << ", "; print_no_parens(op->b); - close(); + stream << ")"; } void IRPrinter::visit(const Max *op) { @@ -523,7 +523,7 @@ void IRPrinter::visit(const Max *op) { print_no_parens(op->a); stream << ", "; print_no_parens(op->b); - close(); + stream << ")"; } void IRPrinter::visit(const EQ *op) { diff --git a/src/Simplify_Add.cpp b/src/Simplify_Add.cpp index 854f6392ce1d..2269a54cea7d 100644 --- a/src/Simplify_Add.cpp +++ b/src/Simplify_Add.cpp @@ -117,7 +117,22 @@ Expr Simplify::visit(const Add *op, ExprInfo *bounds) { rewrite(max(x, y + c0) + c1, max(x + c1, y), c0 + c1 == 0) || rewrite(max(y + c0, x) + c1, max(y, x + c1), c0 + c1 == 0) || rewrite(max(x, y) + min(x, y), x + y) || - rewrite(max(x, y) + min(y, x), x + y))) || + rewrite(max(x, y) + min(y, x), x + y) || + + rewrite(min(x, y + (z*c0)) + (z*c1), min(x + (z*c1), y), (c0 + c1) == 0) || + rewrite(min(x, (y*c0) + z) + (y*c1), min(x + (y*c1), z), (c0 + c1) == 0) || + rewrite(min(x, y*c0) + (y*c1), min(x + (y*c1), 0), (c0 + c1) == 0) || + rewrite(min(x + (y*c0), z) + (y*c1), min((y*c1) + z, x), (c0 + c1) == 0) || + rewrite(min((x*c0) + y, z) + (x*c1), min(y, (x*c1) + z), (c0 + c1) == 0) || + rewrite(min(x*c0, y) + (x*c1), min((x*c1) + y, 0), (c0 + c1) == 0) || + rewrite(max(x, y + (z*c0)) + (z*c1), max(x + (z*c1), y), (c0 + c1) == 0) || + rewrite(max(x, (y*c0) + z) + (y*c1), max(x + (y*c1), z), (c0 + c1) == 0) || + rewrite(max(x, y*c0) + (y*c1), max(x + (y*c1), 0), (c0 + c1) == 0) || + rewrite(max(x + (y*c0), z) + (y*c1), max(x, (y*c1) + z), (c0 + c1) == 0) || + rewrite(max((x*c0) + y, z) + (x*c1), max((x*c1) + z, y), (c0 + c1) == 0) || + rewrite(max(x*c0, y) + (x*c1), max((x*c1) + y, 0), (c0 + c1) == 0) || + + false)) || (no_overflow_int(op->type) && (rewrite((x/c0)*c0 + x%c0, x, c0 != 0) || rewrite((z + x/c0)*c0 + x%c0, z*c0 + x, c0 != 0) ||