diff --git a/base/timing.jl b/base/timing.jl index ac19e9a2e2ba8..ac932d01be121 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -55,6 +55,8 @@ function gc_alloc_count(diff::GC_Diff) diff.malloc + diff.realloc + diff.poolalloc + diff.bigalloc end +# cumulative total time spent on compilation +cumulative_compile_time_ns() = ccall(:jl_cumulative_compile_time_ns, UInt64, ()) # total time spend in garbage collection, in nanoseconds gc_time_ns() = ccall(:jl_gc_total_hrtime, UInt64, ()) @@ -102,7 +104,7 @@ function format_bytes(bytes) # also used by InteractiveUtils end end -function time_print(elapsedtime, bytes=0, gctime=0, allocs=0) +function time_print(elapsedtime, bytes=0, gctime=0, allocs=0, compile_time=0) timestr = Ryu.writefixed(Float64(elapsedtime/1e9), 6) length(timestr) < 10 && print(" "^(10 - length(timestr))) print(timestr, " seconds") @@ -118,15 +120,18 @@ function time_print(elapsedtime, bytes=0, gctime=0, allocs=0) if gctime > 0 print(", ", Ryu.writefixed(Float64(100*gctime/elapsedtime), 2), "% gc time") end + if compile_time > 0 + print(", ", Ryu.writefixed(Float64(100*compile_time/elapsedtime), 2), "% compilation time") + end if bytes != 0 || allocs != 0 print(")") end nothing end -function timev_print(elapsedtime, diff::GC_Diff) +function timev_print(elapsedtime, diff::GC_Diff, compile_time) allocs = gc_alloc_count(diff) - time_print(elapsedtime, diff.allocd, diff.total_time, allocs) + time_print(elapsedtime, diff.allocd, diff.total_time, allocs, compile_time) print("\nelapsed time (ns): $elapsedtime\n") padded_nonzero_print(diff.total_time, "gc time (ns)") padded_nonzero_print(diff.allocd, "bytes allocated") @@ -144,7 +149,8 @@ end A macro to execute an expression, printing the time it took to execute, the number of allocations, and the total number of bytes its execution caused to be allocated, before -returning the value of the expression. +returning the value of the expression. Any time spent garbage collecting (gc) or +compiling is shown as a percentage. See also [`@timev`](@ref), [`@timed`](@ref), [`@elapsed`](@ref), and [`@allocated`](@ref). @@ -155,8 +161,13 @@ See also [`@timev`](@ref), [`@timed`](@ref), [`@elapsed`](@ref), and reduce noise. ```julia-repl -julia> @time rand(10^6); - 0.001525 seconds (7 allocations: 7.630 MiB) +julia> x = rand(10,10); + +julia> @time x * x; + 0.606588 seconds (2.19 M allocations: 116.555 MiB, 3.75% gc time, 99.94% compilation time) + +julia> @time x * x; + 0.000009 seconds (1 allocation: 896 bytes) julia> @time begin sleep(0.3) @@ -170,12 +181,14 @@ macro time(ex) quote while false; end # compiler heuristic: compile this block (alter this if the heuristic changes) local stats = gc_num() + local compile_elapsedtime = cumulative_compile_time_ns() local elapsedtime = time_ns() local val = $(esc(ex)) elapsedtime = time_ns() - elapsedtime + compile_elapsedtime = cumulative_compile_time_ns() - compile_elapsedtime local diff = GC_Diff(gc_num(), stats) time_print(elapsedtime, diff.allocd, diff.total_time, - gc_alloc_count(diff)) + gc_alloc_count(diff), compile_elapsedtime) println() val end @@ -192,22 +205,36 @@ See also [`@time`](@ref), [`@timed`](@ref), [`@elapsed`](@ref), and [`@allocated`](@ref). ```julia-repl -julia> @timev rand(10^6); - 0.001006 seconds (7 allocations: 7.630 MiB) -elapsed time (ns): 1005567 -bytes allocated: 8000256 -pool allocs: 6 -malloc() calls: 1 +julia> x = rand(10,10); + +julia> @timev x * x; + 0.546770 seconds (2.20 M allocations: 116.632 MiB, 4.23% gc time, 99.94% compilation time) +elapsed time (ns): 546769547 +gc time (ns): 23115606 +bytes allocated: 122297811 +pool allocs: 2197930 +non-pool GC allocs:1327 +malloc() calls: 36 +realloc() calls: 5 +GC pauses: 3 + +julia> @timev x * x; + 0.000010 seconds (1 allocation: 896 bytes) +elapsed time (ns): 9848 +bytes allocated: 896 +pool allocs: 1 ``` """ macro timev(ex) quote while false; end # compiler heuristic: compile this block (alter this if the heuristic changes) local stats = gc_num() + local compile_elapsedtime = cumulative_compile_time_ns() local elapsedtime = time_ns() local val = $(esc(ex)) elapsedtime = time_ns() - elapsedtime - timev_print(elapsedtime, GC_Diff(gc_num(), stats)) + compile_elapsedtime = cumulative_compile_time_ns() - compile_elapsedtime + timev_print(elapsedtime, GC_Diff(gc_num(), stats), compile_elapsedtime) val end end diff --git a/doc/src/manual/performance-tips.md b/doc/src/manual/performance-tips.md index 7039daec30a50..18787dd2f942c 100644 --- a/doc/src/manual/performance-tips.md +++ b/doc/src/manual/performance-tips.md @@ -70,7 +70,7 @@ julia> function sum_global() end; julia> @time sum_global() - 0.017705 seconds (15.28 k allocations: 694.484 KiB) + 0.009639 seconds (7.36 k allocations: 300.310 KiB, 98.32% compilation time) 496.84883432553846 julia> @time sum_global() @@ -106,15 +106,15 @@ julia> function sum_arg(x) end; julia> @time sum_arg(x) - 0.007701 seconds (821 allocations: 43.059 KiB) + 0.006202 seconds (4.18 k allocations: 217.860 KiB, 99.72% compilation time) 496.84883432553846 julia> @time sum_arg(x) - 0.000006 seconds (5 allocations: 176 bytes) + 0.000005 seconds (1 allocation: 16 bytes) 496.84883432553846 ``` -The 5 allocations seen are from running the `@time` macro itself in global scope. If we instead run +The 1 allocation seen is from running the `@time` macro itself in global scope. If we instead run the timing in a function, we can see that indeed no allocations are performed: ```jldoctest sumarg; filter = r"[0-9\.]+ seconds" diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 1e49eba98f03f..d6bf220ca73a1 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -67,6 +67,13 @@ void jl_jit_globals(std::map &globals) } } +static uint64_t cumulative_compile_time = 0; + +extern "C" JL_DLLEXPORT +uint64_t jl_cumulative_compile_time_ns() { + return cumulative_compile_time; +} + // this generates llvm code for the lambda info // and adds the result to the jitlayers // (and the shadow module), @@ -210,6 +217,7 @@ extern "C" JL_DLLEXPORT void jl_compile_extern_c(void *llvmmod, void *p, void *sysimg, jl_value_t *declrt, jl_value_t *sigt) { JL_LOCK(&codegen_lock); + uint64_t compiler_start_time = jl_hrtime(); jl_codegen_params_t params; jl_codegen_params_t *pparams = (jl_codegen_params_t*)p; if (pparams == NULL) @@ -228,6 +236,7 @@ void jl_compile_extern_c(void *llvmmod, void *p, void *sysimg, jl_value_t *declr if (llvmmod == NULL) jl_add_to_ee(std::unique_ptr(into)); } + cumulative_compile_time += (jl_hrtime() - compiler_start_time); JL_UNLOCK(&codegen_lock); } @@ -251,8 +260,10 @@ void jl_extern_c(jl_value_t *declrt, jl_tupletype_t *sigt) if (!jl_is_concrete_type(declrt) || jl_is_kind(declrt)) jl_error("@ccallable: return type must be concrete and correspond to a C type"); JL_LOCK(&codegen_lock); + uint64_t compiler_start_time = jl_hrtime(); if (!jl_type_mappable_to_c(declrt)) jl_error("@ccallable: return type doesn't correspond to a C type"); + cumulative_compile_time += (jl_hrtime() - compiler_start_time); JL_UNLOCK(&codegen_lock); // validate method signature @@ -281,6 +292,7 @@ extern "C" jl_code_instance_t *jl_generate_fptr(jl_method_instance_t *mi JL_PROPAGATES_ROOT, size_t world) { JL_LOCK(&codegen_lock); // also disables finalizers, to prevent any unexpected recursion + uint64_t compiler_start_time = jl_hrtime(); // if we don't have any decls already, try to generate it now jl_code_info_t *src = NULL; JL_GC_PUSH1(&src); @@ -317,6 +329,7 @@ jl_code_instance_t *jl_generate_fptr(jl_method_instance_t *mi JL_PROPAGATES_ROOT else { codeinst = NULL; } + cumulative_compile_time += (jl_hrtime() - compiler_start_time); JL_UNLOCK(&codegen_lock); JL_GC_POP(); return codeinst; @@ -329,6 +342,7 @@ void jl_generate_fptr_for_unspecialized(jl_code_instance_t *unspec) return; } JL_LOCK(&codegen_lock); + uint64_t compiler_start_time = jl_hrtime(); if (unspec->invoke == NULL) { jl_code_info_t *src = NULL; JL_GC_PUSH1(&src); @@ -355,6 +369,7 @@ void jl_generate_fptr_for_unspecialized(jl_code_instance_t *unspec) } JL_GC_POP(); } + cumulative_compile_time += (jl_hrtime() - compiler_start_time); JL_UNLOCK(&codegen_lock); // Might GC } @@ -376,6 +391,7 @@ jl_value_t *jl_dump_method_asm(jl_method_instance_t *mi, size_t world, // (using sentinel value `1` instead) // so create an exception here so we can print pretty our lies JL_LOCK(&codegen_lock); // also disables finalizers, to prevent any unexpected recursion + uint64_t compiler_start_time = jl_hrtime(); specfptr = (uintptr_t)codeinst->specptr.fptr; if (specfptr == 0) { jl_code_info_t *src = jl_type_infer(mi, world, 0); @@ -399,6 +415,7 @@ jl_value_t *jl_dump_method_asm(jl_method_instance_t *mi, size_t world, } JL_GC_POP(); } + cumulative_compile_time += (jl_hrtime() - compiler_start_time); JL_UNLOCK(&codegen_lock); } if (specfptr != 0)