diff --git a/NEWS.md b/NEWS.md index 92790c54e5b35..db30021099233 100644 --- a/NEWS.md +++ b/NEWS.md @@ -58,6 +58,10 @@ Multi-threading changes An interactive task desires low latency and implicitly agrees to be short duration or to yield frequently. Interactive tasks will run on interactive threads, if any are specified when Julia is started ([#42302]). +* Threads started outside the Julia runtime (e.g. from C or Java) can now become able to + call into Julia code by calling `jl_adopt_thread`. This is done automatically when + entering Julia code via `cfunction` or a `@ccallable` entry point. As a consequence, the + number of threads can now change during execution ([#46609]). Build system changes -------------------- diff --git a/base/deprecated.jl b/base/deprecated.jl index 87fc670cd594a..a2f5555a40da4 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -336,4 +336,34 @@ function setproperty!(ci::CodeInfo, s::Symbol, v) return setfield!(ci, s, convert(fieldtype(CodeInfo, s), v)) end +@eval Threads nthreads() = threadpoolsize() + +@eval Threads begin + """ + resize_nthreads!(A, copyvalue=A[1]) + + Resize the array `A` to length [`nthreads()`](@ref). Any new + elements that are allocated are initialized to `deepcopy(copyvalue)`, + where `copyvalue` defaults to `A[1]`. + + This is typically used to allocate per-thread variables, and + should be called in `__init__` if `A` is a global constant. + + !!! warning + + This function is deprecated, since as of Julia v1.9 the number of + threads can change at run time. Instead, per-thread state should be + created as needed based on the thread id of the caller. + """ + function resize_nthreads!(A::AbstractVector, copyvalue=A[1]) + nthr = nthreads() + nold = length(A) + resize!(A, nthr) + for i = nold+1:nthr + A[i] = deepcopy(copyvalue) + end + return A + end +end + # END 1.9 deprecations diff --git a/base/partr.jl b/base/partr.jl index a4cfcb60fe520..c5bb6603d53af 100644 --- a/base/partr.jl +++ b/base/partr.jl @@ -2,7 +2,7 @@ module Partr -using ..Threads: SpinLock, nthreads, threadid +using ..Threads: SpinLock, maxthreadid, threadid # a task minheap mutable struct taskheap diff --git a/base/pcre.jl b/base/pcre.jl index d689e9be29113..7597c1217ca9e 100644 --- a/base/pcre.jl +++ b/base/pcre.jl @@ -29,7 +29,7 @@ THREAD_MATCH_CONTEXTS::Vector{Ptr{Cvoid}} = [C_NULL] PCRE_COMPILE_LOCK = nothing _tid() = Int(ccall(:jl_threadid, Int16, ())) + 1 -_nth() = Int(unsafe_load(cglobal(:jl_n_threads, Cint))) +_mth() = Int(Core.Intrinsics.atomic_pointerref(cglobal(:jl_n_threads, Cint), :acquire)) function get_local_match_context() tid = _tid() @@ -41,7 +41,7 @@ function get_local_match_context() try ctxs = THREAD_MATCH_CONTEXTS if length(ctxs) < tid - global THREAD_MATCH_CONTEXTS = ctxs = copyto!(fill(C_NULL, _nth()), ctxs) + global THREAD_MATCH_CONTEXTS = ctxs = copyto!(fill(C_NULL, length(ctxs) + _mth()), ctxs) end finally unlock(l) diff --git a/base/task.jl b/base/task.jl index 4ad562853efea..ce34d2f179fc5 100644 --- a/base/task.jl +++ b/base/task.jl @@ -754,7 +754,7 @@ function workqueue_for(tid::Int) @lock l begin qs = Workqueues if length(qs) < tid - nt = Threads.nthreads() + nt = Threads.maxthreadid() @assert tid <= nt global Workqueues = qs = copyto!(typeof(qs)(undef, length(qs) + nt - 1), qs) end @@ -767,7 +767,7 @@ end function enq_work(t::Task) (t._state === task_state_runnable && t.queue === nothing) || error("schedule: Task not runnable") - if t.sticky || Threads.nthreads() == 1 + if t.sticky || Threads.threadpoolsize() == 1 tid = 
Threads.threadid(t) if tid == 0 # Issue #41324 diff --git a/base/threadingconstructs.jl b/base/threadingconstructs.jl index 6c8ea35cfa373..271d6ea9f7664 100644 --- a/base/threadingconstructs.jl +++ b/base/threadingconstructs.jl @@ -11,20 +11,27 @@ ID `1`. """ threadid() = Int(ccall(:jl_threadid, Int16, ())+1) +# lower bound on the largest threadid() """ - Threads.nthreads([:default|:interactive]) -> Int + Threads.maxthreadid() -> Int -Get the number of threads (across all thread pools or within the specified -thread pool) available to Julia. The number of threads across all thread -pools is the inclusive upper bound on [`threadid()`](@ref). +Get a lower bound on the number of threads (across all thread pools) available +to the Julia process, with atomic-acquire semantics. The result will always be +greater than or equal to [`threadid()`](@ref) as well as `threadid(task)` for +any task you were able to observe before calling `maxthreadid`. +""" +maxthreadid() = Int(Core.Intrinsics.atomic_pointerref(cglobal(:jl_n_threads, Cint), :acquire)) -See also: `BLAS.get_num_threads` and `BLAS.set_num_threads` in the -[`LinearAlgebra`](@ref man-linalg) standard library, and `nprocs()` in the -[`Distributed`](@ref man-distributed) standard library. """ -function nthreads end + Threads.nthreads(:default | :interactive) -> Int -nthreads() = Int(unsafe_load(cglobal(:jl_n_threads, Cint))) +Get the current number of threads within the specified thread pool. The threads in default +have id numbers `1:nthreads(:default)`. + +See also `BLAS.get_num_threads` and `BLAS.set_num_threads` in the [`LinearAlgebra`](@ref +man-linalg) standard library, and `nprocs()` in the [`Distributed`](@ref man-distributed) +standard library and [`Threads.maxthreadid()`](@ref). +""" function nthreads(pool::Symbol) if pool === :default tpid = Int8(0) @@ -35,6 +42,7 @@ function nthreads(pool::Symbol) end return _nthreads_in_pool(tpid) end + function _nthreads_in_pool(tpid::Int8) p = unsafe_load(cglobal(:jl_n_threads_per_pool, Ptr{Cint})) return Int(unsafe_load(p, tpid + 1)) @@ -57,10 +65,20 @@ Returns the number of threadpools currently configured. """ nthreadpools() = Int(unsafe_load(cglobal(:jl_n_threadpools, Cint))) +""" + Threads.threadpoolsize() + +Get the number of threads available to the Julia default worker-thread pool. + +See also: `BLAS.get_num_threads` and `BLAS.set_num_threads` in the +[`LinearAlgebra`](@ref man-linalg) standard library, and `nprocs()` in the +[`Distributed`](@ref man-distributed) standard library. +""" +threadpoolsize() = Threads._nthreads_in_pool(Int8(0)) function threading_run(fun, static) ccall(:jl_enter_threaded_region, Cvoid, ()) - n = nthreads() + n = threadpoolsize() tasks = Vector{Task}(undef, n) for i = 1:n t = Task(() -> fun(i)) # pass in tid @@ -93,7 +111,7 @@ function _threadsfor(iter, lbody, schedule) tid = 1 len, rem = lenr, 0 else - len, rem = divrem(lenr, nthreads()) + len, rem = divrem(lenr, threadpoolsize()) end # not enough iterations for all the threads? if len == 0 @@ -185,7 +203,7 @@ assumption may be removed in the future. This scheduling option is merely a hint to the underlying execution mechanism. However, a few properties can be expected. The number of `Task`s used by `:dynamic` scheduler is bounded by a small constant multiple of the number of available worker threads -([`nthreads()`](@ref Threads.nthreads)). Each task processes contiguous regions of the +([`Threads.threadpoolsize()`](@ref)). Each task processes contiguous regions of the iteration space. 
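For instance, a minimal sketch of that contiguous partitioning (illustrative only; the scheduler itself uses the equivalent `divrem` logic in `_threadsfor` above, and `chunks` is not part of the API):

```julia
# Split 1:N into at most Threads.threadpoolsize() contiguous ranges,
# giving the first `rem` chunks one extra iteration, as divrem leaves them.
function chunks(N::Int)
    n = max(1, min(N, Threads.threadpoolsize()))
    len, rem = divrem(N, n)
    ranges = UnitRange{Int}[]
    lo = 1
    for i in 1:n
        hi = lo + len - 1 + (i <= rem)
        push!(ranges, lo:hi)
        lo = hi + 1
    end
    return ranges
end

chunks(10)  # with 4 default-pool threads: [1:3, 4:6, 7:8, 9:10]
```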
Thus, `@threads :dynamic for x in xs; f(x); end` is typically more efficient than `@sync for x in xs; @spawn f(x); end` if `length(xs)` is significantly larger than the number of the worker threads and the run-time of `f(x)` is relatively @@ -222,7 +240,7 @@ julia> function busywait(seconds) julia> @time begin Threads.@spawn busywait(5) - Threads.@threads :static for i in 1:Threads.nthreads() + Threads.@threads :static for i in 1:Threads.threadpoolsize() busywait(1) end end @@ -230,7 +248,7 @@ julia> @time begin julia> @time begin Threads.@spawn busywait(5) - Threads.@threads :dynamic for i in 1:Threads.nthreads() + Threads.@threads :dynamic for i in 1:Threads.threadpoolsize() busywait(1) end end diff --git a/base/threads.jl b/base/threads.jl index 2b68c7104ee5e..2d388cc4b9f77 100644 --- a/base/threads.jl +++ b/base/threads.jl @@ -11,25 +11,4 @@ include("threadingconstructs.jl") include("atomics.jl") include("locks-mt.jl") - -""" - resize_nthreads!(A, copyvalue=A[1]) - -Resize the array `A` to length [`nthreads()`](@ref). Any new -elements that are allocated are initialized to `deepcopy(copyvalue)`, -where `copyvalue` defaults to `A[1]`. - -This is typically used to allocate per-thread variables, and -should be called in `__init__` if `A` is a global constant. -""" -function resize_nthreads!(A::AbstractVector, copyvalue=A[1]) - nthr = nthreads() - nold = length(A) - resize!(A, nthr) - for i = nold+1:nthr - A[i] = deepcopy(copyvalue) - end - return A -end - end diff --git a/base/threads_overloads.jl b/base/threads_overloads.jl index 376c1af94f441..7241d3182901d 100644 --- a/base/threads_overloads.jl +++ b/base/threads_overloads.jl @@ -3,7 +3,7 @@ """ Threads.foreach(f, channel::Channel; schedule::Threads.AbstractSchedule=Threads.FairSchedule(), - ntasks=Threads.nthreads()) + ntasks=Base.threadpoolsize()) Similar to `foreach(f, channel)`, but iteration over `channel` and calls to `f` are split across `ntasks` tasks spawned by `Threads.@spawn`. This function @@ -40,7 +40,7 @@ collect(d) = [1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256 """ function Threads.foreach(f, channel::Channel; schedule::Threads.AbstractSchedule=Threads.FairSchedule(), - ntasks=Threads.nthreads()) + ntasks=Threads.threadpoolsize()) apply = _apply_for_schedule(schedule) stop = Threads.Atomic{Bool}(false) @sync for _ in 1:ntasks diff --git a/cli/loader_exe.c b/cli/loader_exe.c index a5a9968896af6..9187d4f919cf4 100644 --- a/cli/loader_exe.c +++ b/cli/loader_exe.c @@ -15,7 +15,7 @@ extern "C" { JULIA_DEFINE_FAST_TLS #ifdef _COMPILER_ASAN_ENABLED_ -JL_DLLEXPORT const char* __asan_default_options() +JL_DLLEXPORT const char* __asan_default_options(void) { return "allow_user_segv_handler=1:detect_leaks=0"; // FIXME: enable LSAN after fixing leaks & defining __lsan_default_suppressions(), diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl index acd61be502465..264036f4fb3ae 100644 --- a/contrib/generate_precompile.jl +++ b/contrib/generate_precompile.jl @@ -1,7 +1,7 @@ # This file is a part of Julia. 
License is MIT: https://julialang.org/license -if Threads.nthreads() != 1 - @warn "Running this file with multiple Julia threads may lead to a build error" Threads.nthreads() +if Threads.maxthreadid() != 1 + @warn "Running this file with multiple Julia threads may lead to a build error" Base.maxthreadid() end if Base.isempty(Base.ARGS) || Base.ARGS[1] !== "0" @@ -340,7 +340,7 @@ function generate_precompile_statements() # wait for the next prompt-like to appear readuntil(output_copy, "\n") strbuf = "" - while true + while !eof(output_copy) strbuf *= String(readavailable(output_copy)) occursin(JULIA_PROMPT, strbuf) && break occursin(PKG_PROMPT, strbuf) && break diff --git a/doc/src/base/multi-threading.md b/doc/src/base/multi-threading.md index 293857c1c6c65..4932aef4cc938 100644 --- a/doc/src/base/multi-threading.md +++ b/doc/src/base/multi-threading.md @@ -5,9 +5,11 @@ Base.Threads.@threads Base.Threads.foreach Base.Threads.@spawn Base.Threads.threadid +Base.Threads.maxthreadid Base.Threads.nthreads Base.Threads.threadpool Base.Threads.nthreadpools +Base.Threads.threadpoolsize ``` See also [Multi-Threading](@ref man-multithreading). diff --git a/doc/src/manual/embedding.md b/doc/src/manual/embedding.md index 26904d9ccffcd..0430d8a7c1ffb 100644 --- a/doc/src/manual/embedding.md +++ b/doc/src/manual/embedding.md @@ -604,7 +604,7 @@ The second condition above implies that you can not safely call `jl_...()` funct void *func(void*) { // Wrong, jl_eval_string() called from thread that was not started by Julia - jl_eval_string("println(Threads.nthreads())"); + jl_eval_string("println(Threads.threadid())"); return NULL; } @@ -630,7 +630,7 @@ void *func(void*) // Okay, all jl_...() calls from the same thread, // even though it is not the main application thread jl_init(); - jl_eval_string("println(Threads.nthreads())"); + jl_eval_string("println(Threads.threadid())"); jl_atexit_hook(0); return NULL; } @@ -670,7 +670,7 @@ int main() jl_eval_string("func(i) = ccall(:c_func, Float64, (Int32,), i)"); // Call func() multiple times, using multiple threads to do so - jl_eval_string("println(Threads.nthreads())"); + jl_eval_string("println(Base.threadpoolsize())"); jl_eval_string("use(i) = println(\"[J $(Threads.threadid())] i = $(i) -> $(func(i))\")"); jl_eval_string("Threads.@threads for i in 1:5 use(i) end"); diff --git a/doc/src/manual/multi-threading.md b/doc/src/manual/multi-threading.md index 9ebba4fd7f676..b012de27ac81f 100644 --- a/doc/src/manual/multi-threading.md +++ b/doc/src/manual/multi-threading.md @@ -267,7 +267,7 @@ avoid the race: ```julia-repl julia> using Base.Threads -julia> nthreads() +julia> Threads.nthreads() 4 julia> acc = Ref(0) diff --git a/src/ccall.cpp b/src/ccall.cpp index fb5799b081537..b2e66a1345f96 100644 --- a/src/ccall.cpp +++ b/src/ccall.cpp @@ -1552,7 +1552,8 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) assert(lrt == getVoidTy(ctx.builder.getContext())); assert(!isVa && !llvmcall && nccallargs == 0); JL_GC_POP(); - emit_gc_safepoint(ctx); + ctx.builder.CreateCall(prepare_call(gcroot_flush_func)); + emit_gc_safepoint(ctx.builder, get_current_ptls(ctx), ctx.tbaa().tbaa_const); return ghostValue(ctx, jl_nothing_type); } else if (is_libjulia_func("jl_get_ptls_states")) { @@ -1655,7 +1656,8 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs) ctx.builder.SetInsertPoint(checkBB); ctx.builder.CreateLoad( getSizeTy(ctx.builder.getContext()), - 
ctx.builder.CreateConstInBoundsGEP1_32(getSizeTy(ctx.builder.getContext()), get_current_signal_page(ctx), -1), + ctx.builder.CreateConstInBoundsGEP1_32(getSizeTy(ctx.builder.getContext()), + get_current_signal_page_from_ptls(ctx.builder, get_current_ptls(ctx), ctx.tbaa().tbaa_const), -1), true); ctx.builder.CreateBr(contBB); ctx.f->getBasicBlockList().push_back(contBB); diff --git a/src/cgutils.cpp b/src/cgutils.cpp index c42e6f14473b3..9b53b50268d06 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -8,7 +8,6 @@ STATISTIC(EmittedPointerFromObjref, "Number of emitted pointer_from_objref calls"); STATISTIC(EmittedPointerBitcast, "Number of emitted pointer bitcasts"); -STATISTIC(EmittedNthPtrAddr, "Number of emitted nth pointer address instructions"); STATISTIC(EmittedTypeof, "Number of emitted typeof instructions"); STATISTIC(EmittedErrors, "Number of emitted errors"); STATISTIC(EmittedConditionalErrors, "Number of emitted conditional errors"); @@ -42,7 +41,6 @@ STATISTIC(EmittedCPointerChecks, "Number of C pointer checks emitted"); STATISTIC(EmittedAllocObjs, "Number of object allocations emitted"); STATISTIC(EmittedWriteBarriers, "Number of write barriers emitted"); STATISTIC(EmittedNewStructs, "Number of new structs emitted"); -STATISTIC(EmittedSignalFences, "Number of signal fences emitted"); STATISTIC(EmittedDeferSignal, "Number of deferred signals emitted"); static Value *track_pjlvalue(jl_codectx_t &ctx, Value *V) @@ -971,41 +969,20 @@ static void emit_memcpy(jl_codectx_t &ctx, Value *dst, MDNode *tbaa_dst, const j emit_memcpy_llvm(ctx, dst, tbaa_dst, data_pointer(ctx, src), src.tbaa, sz, align, is_volatile); } -static Value *emit_nthptr_addr(jl_codectx_t &ctx, Value *v, ssize_t n, bool gctracked = true) -{ - ++EmittedNthPtrAddr; - return ctx.builder.CreateInBoundsGEP( - ctx.types().T_prjlvalue, - emit_bitcast(ctx, maybe_decay_tracked(ctx, v), ctx.types().T_pprjlvalue), - ConstantInt::get(getSizeTy(ctx.builder.getContext()), n)); -} - -static Value *emit_nthptr_addr(jl_codectx_t &ctx, Value *v, Value *idx) +static LoadInst *emit_nthptr_recast(jl_codectx_t &ctx, Value *v, Value *idx, MDNode *tbaa, Type *type) { - ++EmittedNthPtrAddr; - return ctx.builder.CreateInBoundsGEP( + // p = (jl_value_t**)v; *(type*)&p[n] + Value *vptr = ctx.builder.CreateInBoundsGEP( ctx.types().T_prjlvalue, emit_bitcast(ctx, maybe_decay_tracked(ctx, v), ctx.types().T_pprjlvalue), idx); + LoadInst *load = ctx.builder.CreateLoad(type, emit_bitcast(ctx, vptr, PointerType::get(type, 0))); + tbaa_decorate(tbaa, load); + return load; } -static LoadInst *emit_nthptr_recast(jl_codectx_t &ctx, Value *v, Value *idx, MDNode *tbaa, Type *type) -{ - // p = (jl_value_t**)v; *(type*)&p[n] - Value *vptr = emit_nthptr_addr(ctx, v, idx); - return cast<LoadInst>(tbaa_decorate(tbaa, ctx.builder.CreateLoad(type, - emit_bitcast(ctx, vptr, PointerType::get(type, 0))))); -} - -static LoadInst *emit_nthptr_recast(jl_codectx_t &ctx, Value *v, ssize_t n, MDNode *tbaa, Type *type) -{ - // p = (jl_value_t**)v; *(type*)&p[n] - Value *vptr = emit_nthptr_addr(ctx, v, n); - return cast<LoadInst>(tbaa_decorate(tbaa, ctx.builder.CreateLoad(type, - emit_bitcast(ctx, vptr, PointerType::get(type, 0))))); - } - static Value *boxed(jl_codectx_t &ctx, const jl_cgval_t &v, bool is_promotable=false); + static Value *emit_typeof(jl_codectx_t &ctx, Value *v, bool maybenull); static jl_cgval_t emit_typeof(jl_codectx_t &ctx, const jl_cgval_t &p, bool maybenull) @@ -1177,8 +1154,12 @@ static Value *emit_datatype_isprimitivetype(jl_codectx_t &ctx, Value *dt) static Value 
*emit_datatype_name(jl_codectx_t &ctx, Value *dt) { - Value *vptr = emit_nthptr_addr(ctx, dt, (ssize_t)(offsetof(jl_datatype_t, name) / sizeof(char*))); - return tbaa_decorate(ctx.tbaa().tbaa_const, ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, vptr, Align(sizeof(void*)))); + unsigned n = offsetof(jl_datatype_t, name) / sizeof(char*); + Value *vptr = ctx.builder.CreateInBoundsGEP( + ctx.types().T_pjlvalue, + emit_bitcast(ctx, maybe_decay_tracked(ctx, dt), ctx.types().T_ppjlvalue), + ConstantInt::get(getSizeTy(ctx.builder.getContext()), n)); + return tbaa_decorate(ctx.tbaa().tbaa_const, ctx.builder.CreateAlignedLoad(ctx.types().T_pjlvalue, vptr, Align(sizeof(void*)))); } // --- generating various error checks --- @@ -1508,8 +1489,8 @@ static std::pair<Value*, bool> emit_isa(jl_codectx_t &ctx, const jl_cgval_t &x, // so the isa test reduces to a comparison of the typename by pointer return std::make_pair( ctx.builder.CreateICmpEQ( - mark_callee_rooted(ctx, emit_datatype_name(ctx, emit_typeof_boxed(ctx, x))), - mark_callee_rooted(ctx, literal_pointer_val(ctx, (jl_value_t*)dt->name))), + emit_datatype_name(ctx, emit_typeof_boxed(ctx, x)), + literal_pointer_val(ctx, (jl_value_t*)dt->name)), false); } if (jl_is_uniontype(intersected_type) && @@ -3445,10 +3426,10 @@ static void emit_cpointercheck(jl_codectx_t &ctx, const jl_cgval_t &x, const std emit_typecheck(ctx, mark_julia_type(ctx, t, true, jl_any_type), (jl_value_t*)jl_datatype_type, msg); Value *istype = - ctx.builder.CreateICmpEQ(mark_callee_rooted(ctx, emit_datatype_name(ctx, t)), - mark_callee_rooted(ctx, literal_pointer_val(ctx, (jl_value_t*)jl_pointer_typename))); - BasicBlock *failBB = BasicBlock::Create(ctx.builder.getContext(),"fail",ctx.f); - BasicBlock *passBB = BasicBlock::Create(ctx.builder.getContext(),"pass"); + ctx.builder.CreateICmpEQ(emit_datatype_name(ctx, t), + literal_pointer_val(ctx, (jl_value_t*)jl_pointer_typename)); + BasicBlock *failBB = BasicBlock::Create(ctx.builder.getContext(), "fail", ctx.f); + BasicBlock *passBB = BasicBlock::Create(ctx.builder.getContext(), "pass"); ctx.builder.CreateCondBr(istype, passBB, failBB); ctx.builder.SetInsertPoint(failBB); @@ -3896,8 +3877,7 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg static void emit_signal_fence(jl_codectx_t &ctx) { - ++EmittedSignalFences; - ctx.builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SyncScope::SingleThread); + emit_signal_fence(ctx.builder); } static Value *emit_defer_signal(jl_codectx_t &ctx) @@ -3910,69 +3890,6 @@ static Value *emit_defer_signal(jl_codectx_t &ctx) return ctx.builder.CreateInBoundsGEP(ctx.types().T_sigatomic, ptls, ArrayRef<Value*>(offset), "jl_defer_signal"); } -static void emit_gc_safepoint(jl_codectx_t &ctx) -{ - ctx.builder.CreateCall(prepare_call(gcroot_flush_func)); - emit_signal_fence(ctx); - ctx.builder.CreateLoad(getSizeTy(ctx.builder.getContext()), get_current_signal_page(ctx), true); - emit_signal_fence(ctx); -} - -static Value *emit_gc_state_set(jl_codectx_t &ctx, Value *state, Value *old_state) -{ - Type *T_int8 = state->getType(); - Value *ptls = emit_bitcast(ctx, get_current_ptls(ctx), getInt8PtrTy(ctx.builder.getContext())); - Constant *offset = ConstantInt::getSigned(getInt32Ty(ctx.builder.getContext()), offsetof(jl_tls_states_t, gc_state)); - Value *gc_state = ctx.builder.CreateInBoundsGEP(T_int8, ptls, ArrayRef<Value*>(offset), "gc_state"); - if (old_state == nullptr) { - old_state = ctx.builder.CreateLoad(T_int8, gc_state); - cast<LoadInst>(old_state)->setOrdering(AtomicOrdering::Monotonic); - 
} - ctx.builder.CreateAlignedStore(state, gc_state, Align(sizeof(void*)))->setOrdering(AtomicOrdering::Release); - if (auto *C = dyn_cast<ConstantInt>(old_state)) - if (C->isZero()) - return old_state; - if (auto *C = dyn_cast<ConstantInt>(state)) - if (!C->isZero()) - return old_state; - BasicBlock *passBB = BasicBlock::Create(ctx.builder.getContext(), "safepoint", ctx.f); - BasicBlock *exitBB = BasicBlock::Create(ctx.builder.getContext(), "after_safepoint", ctx.f); - Constant *zero8 = ConstantInt::get(T_int8, 0); - ctx.builder.CreateCondBr(ctx.builder.CreateAnd(ctx.builder.CreateICmpNE(old_state, zero8), // if (old_state && !state) - ctx.builder.CreateICmpEQ(state, zero8)), - passBB, exitBB); - ctx.builder.SetInsertPoint(passBB); - emit_gc_safepoint(ctx); - ctx.builder.CreateBr(exitBB); - ctx.builder.SetInsertPoint(exitBB); - return old_state; -} - -static Value *emit_gc_unsafe_enter(jl_codectx_t &ctx) -{ - Value *state = ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0); - return emit_gc_state_set(ctx, state, nullptr); -} - -static Value *emit_gc_unsafe_leave(jl_codectx_t &ctx, Value *state) -{ - Value *old_state = ConstantInt::get(state->getType(), 0); - return emit_gc_state_set(ctx, state, old_state); -} - -//static Value *emit_gc_safe_enter(jl_codectx_t &ctx) -//{ -// Value *state = ConstantInt::get(getInt8Ty(ctx.builder.getContext()), JL_GC_STATE_SAFE); -// return emit_gc_state_set(ctx, state, nullptr); -//} -// -//static Value *emit_gc_safe_leave(jl_codectx_t &ctx, Value *state) -//{ -// Value *old_state = ConstantInt::get(state->getType(), JL_GC_STATE_SAFE); -// return emit_gc_state_set(ctx, state, old_state); -//} - - #ifndef JL_NDEBUG static int compare_cgparams(const jl_cgparams_t *a, const jl_cgparams_t *b) diff --git a/src/codegen.cpp b/src/codegen.cpp index 9c09314c9aee1..372e9bc560e15 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -588,6 +588,11 @@ static const auto jlpgcstack_func = new JuliaFunction{ nullptr, }; +static const auto jladoptthread_func = new JuliaFunction{ + "julia.get_pgcstack_or_new", + jlpgcstack_func->_type, + jlpgcstack_func->_attrs, +}; // important functions @@ -1492,11 +1497,9 @@ static Value *global_binding_pointer(jl_codectx_t &ctx, jl_module_t *m, jl_sym_t static jl_cgval_t emit_checked_var(jl_codectx_t &ctx, Value *bp, jl_sym_t *name, bool isvol, MDNode *tbaa); static jl_cgval_t emit_sparam(jl_codectx_t &ctx, size_t i); static Value *emit_condition(jl_codectx_t &ctx, const jl_cgval_t &condV, const std::string &msg); -static void allocate_gc_frame(jl_codectx_t &ctx, BasicBlock *b0); static Value *get_current_task(jl_codectx_t &ctx); static Value *get_current_ptls(jl_codectx_t &ctx); static Value *get_last_age_field(jl_codectx_t &ctx); -static Value *get_current_signal_page(jl_codectx_t &ctx); static void CreateTrap(IRBuilder<> &irbuilder, bool create_new_block = true); static CallInst *emit_jlcall(jl_codectx_t &ctx, Function *theFptr, Value *theF, const jl_cgval_t *args, size_t nargs, JuliaFunction *trampoline); @@ -5314,21 +5317,17 @@ JL_GCC_IGNORE_STOP // --- generate function bodies --- // gc frame emission -static void allocate_gc_frame(jl_codectx_t &ctx, BasicBlock *b0) +static void allocate_gc_frame(jl_codectx_t &ctx, BasicBlock *b0, bool or_new=false) { // allocate a placeholder gc instruction // this will require the runtime, but it gets deleted later if unused - ctx.topalloca = ctx.builder.CreateCall(prepare_call(jlpgcstack_func)); + ctx.topalloca = ctx.builder.CreateCall(prepare_call(or_new ? 
jladoptthread_func : jlpgcstack_func)); ctx.pgcstack = ctx.topalloca; } static Value *get_current_task(jl_codectx_t &ctx) { - const int ptls_offset = offsetof(jl_task_t, gcstack); - return ctx.builder.CreateInBoundsGEP( - ctx.types().T_pjlvalue, emit_bitcast(ctx, ctx.pgcstack, ctx.types().T_ppjlvalue), - ConstantInt::get(getSizeTy(ctx.builder.getContext()), -(ptls_offset / sizeof(void *))), - "current_task"); + return get_current_task_from_pgcstack(ctx.builder, ctx.pgcstack); } // Get PTLS through current task. @@ -5348,15 +5347,6 @@ static Value *get_last_age_field(jl_codectx_t &ctx) "world_age"); } -// Get signal page through current task. -static Value *get_current_signal_page(jl_codectx_t &ctx) -{ - // return ctx.builder.CreateCall(prepare_call(reuse_signal_page_func)); - Value *ptls = get_current_ptls(ctx); - int nthfield = offsetof(jl_tls_states_t, safepoint) / sizeof(void *); - return emit_nthptr_recast(ctx, ptls, nthfield, ctx.tbaa().tbaa_const, getSizePtrTy(ctx.builder.getContext())); -} - static Function *emit_tojlinvoke(jl_code_instance_t *codeinst, Module *M, jl_codegen_params_t ¶ms) { ++EmittedToJLInvokes; @@ -5641,19 +5631,11 @@ static Function* gen_cfun_wrapper( ctx.builder.SetInsertPoint(b0); DebugLoc noDbg; ctx.builder.SetCurrentDebugLocation(noDbg); - allocate_gc_frame(ctx, b0); + allocate_gc_frame(ctx, b0, true); - Value *dummy_world = ctx.builder.CreateAlloca(getSizeTy(ctx.builder.getContext())); - Value *have_tls = ctx.builder.CreateIsNotNull(ctx.pgcstack); - // TODO: in the future, initialize a full TLS context here Value *world_age_field = get_last_age_field(ctx); - world_age_field = ctx.builder.CreateSelect(have_tls, world_age_field, dummy_world); Value *last_age = tbaa_decorate(ctx.tbaa().tbaa_gcframe, ctx.builder.CreateAlignedLoad(getSizeTy(ctx.builder.getContext()), world_age_field, Align(sizeof(size_t)))); - Value *last_gc_state = ConstantInt::get(getInt8Ty(ctx.builder.getContext()), JL_GC_STATE_SAFE); - last_gc_state = emit_guarded_test(ctx, have_tls, last_gc_state, [&] { - return emit_gc_unsafe_enter(ctx); - }); Value *world_v = ctx.builder.CreateAlignedLoad(getSizeTy(ctx.builder.getContext()), prepare_global_in(jl_Module, jlgetworld_global), Align(sizeof(size_t))); @@ -5668,12 +5650,7 @@ static Function* gen_cfun_wrapper( emit_bitcast(ctx, literal_pointer_val(ctx, (jl_value_t*)codeinst), getSizePtrTy(ctx.builder.getContext())), offsetof(jl_code_instance_t, max_world) / sizeof(size_t)), Align(sizeof(size_t))); - // XXX: age is always OK if we don't have a TLS. This is a hack required due to `@threadcall` abuse. 
- // and adds quite a bit of complexity here, even though it's still wrong - // (anything that tries to interact with the runtime will fault) age_ok = ctx.builder.CreateICmpUGE(lam_max, world_v); - world_v = ctx.builder.CreateSelect(ctx.builder.CreateOr(have_tls, age_ok), world_v, lam_max); - age_ok = ctx.builder.CreateOr(ctx.builder.CreateNot(have_tls), age_ok); } ctx.builder.CreateStore(world_v, world_age_field); @@ -6030,12 +6007,6 @@ static Function* gen_cfun_wrapper( } ctx.builder.CreateStore(last_age, world_age_field); - if (!sig.retboxed) { - emit_guarded_test(ctx, have_tls, nullptr, [&] { - emit_gc_unsafe_leave(ctx, last_gc_state); - return nullptr; - }); - } ctx.builder.CreateRet(r); ctx.builder.SetCurrentDebugLocation(noDbg); @@ -8458,6 +8429,7 @@ static void init_jit_functions(void) add_named_global(jl_write_barrier_func, (void*)NULL); add_named_global(jl_write_barrier_binding_func, (void*)NULL); add_named_global(jldlsym_func, &jl_load_and_lookup); + add_named_global("jl_adopt_thread", &jl_adopt_thread); add_named_global(jlgetcfunctiontrampoline_func, &jl_get_cfunction_trampoline); add_named_global(jlgetnthfieldchecked_func, &jl_get_nth_field_checked); add_named_global(diff_gc_total_bytes_func, &jl_gc_diff_total_bytes); diff --git a/src/codegen_shared.h b/src/codegen_shared.h index 0e68668378f4e..329cc567e8c5f 100644 --- a/src/codegen_shared.h +++ b/src/codegen_shared.h @@ -22,6 +22,7 @@ enum AddressSpace { }; static inline auto getSizeTy(llvm::LLVMContext &ctxt) { + //return M.getDataLayout().getIntPtrType(M.getContext()); if (sizeof(size_t) > sizeof(uint32_t)) { return llvm::Type::getInt64Ty(ctxt); } else { @@ -176,26 +177,127 @@ static inline llvm::Value *emit_bitcast_with_builder(llvm::IRBuilder<> &builder, } } +// Get PTLS through current task. +static inline llvm::Value *get_current_task_from_pgcstack(llvm::IRBuilder<> &builder, llvm::Value *pgcstack) +{ + using namespace llvm; + auto T_ppjlvalue = JuliaType::get_ppjlvalue_ty(builder.getContext()); + auto T_pjlvalue = JuliaType::get_pjlvalue_ty(builder.getContext()); + const int pgcstack_offset = offsetof(jl_task_t, gcstack); + return builder.CreateInBoundsGEP( + T_pjlvalue, emit_bitcast_with_builder(builder, pgcstack, T_ppjlvalue), + ConstantInt::get(getSizeTy(builder.getContext()), -(pgcstack_offset / sizeof(void *))), + "current_task"); +} + // Get PTLS through current task. static inline llvm::Value *get_current_ptls_from_task(llvm::IRBuilder<> &builder, llvm::Value *current_task, llvm::MDNode *tbaa) { using namespace llvm; auto T_ppjlvalue = JuliaType::get_ppjlvalue_ty(builder.getContext()); auto T_pjlvalue = JuliaType::get_pjlvalue_ty(builder.getContext()); - auto T_size = builder.GetInsertBlock()->getModule()->getDataLayout().getIntPtrType(builder.getContext()); + auto T_size = getSizeTy(builder.getContext()); const int ptls_offset = offsetof(jl_task_t, ptls); llvm::Value *pptls = builder.CreateInBoundsGEP( - T_pjlvalue, current_task, - ConstantInt::get(T_size, ptls_offset / sizeof(void *)), - "ptls_field"); + T_pjlvalue, current_task, + ConstantInt::get(T_size, ptls_offset / sizeof(void *)), + "ptls_field"); LoadInst *ptls_load = builder.CreateAlignedLoad(T_pjlvalue, - emit_bitcast_with_builder(builder, pptls, T_ppjlvalue), Align(sizeof(void *)), "ptls_load"); + emit_bitcast_with_builder(builder, pptls, T_ppjlvalue), Align(sizeof(void *)), "ptls_load"); // Note: Corresponding store (`t->ptls = ptls`) happens in `ctx_switch` of tasks.c. 
tbaa_decorate(tbaa, ptls_load); - // Using `CastInst::Create` to get an `Instruction*` without explicit cast: - auto ptls = CastInst::Create(Instruction::BitCast, ptls_load, T_ppjlvalue, "ptls"); - builder.Insert(ptls); - return ptls; + return builder.CreateBitCast(ptls_load, T_ppjlvalue, "ptls"); +} + +// Get signal page through current task. +static inline llvm::Value *get_current_signal_page_from_ptls(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::MDNode *tbaa) +{ + using namespace llvm; + // return builder.CreateCall(prepare_call(reuse_signal_page_func)); + auto T_size = getSizeTy(builder.getContext()); + auto T_psize = T_size->getPointerTo(); + auto T_ppsize = T_psize->getPointerTo(); + int nthfield = offsetof(jl_tls_states_t, safepoint) / sizeof(void *); + ptls = emit_bitcast_with_builder(builder, ptls, T_ppsize); + llvm::Value *psafepoint = builder.CreateInBoundsGEP( + T_psize, ptls, ConstantInt::get(T_size, nthfield)); + LoadInst *ptls_load = builder.CreateAlignedLoad( + T_psize, psafepoint, Align(sizeof(void *)), "safepoint"); + tbaa_decorate(tbaa, ptls_load); + return ptls_load; +} + +static inline void emit_signal_fence(llvm::IRBuilder<> &builder) +{ + using namespace llvm; + builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SyncScope::SingleThread); +} + +static inline void emit_gc_safepoint(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::MDNode *tbaa) +{ + emit_signal_fence(builder); + builder.CreateLoad(getSizeTy(builder.getContext()), get_current_signal_page_from_ptls(builder, ptls, tbaa), true); + emit_signal_fence(builder); +} + +static inline llvm::Value *emit_gc_state_set(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state, llvm::Value *old_state) +{ + using namespace llvm; + Type *T_int8 = state->getType(); + ptls = emit_bitcast_with_builder(builder, ptls, builder.getInt8PtrTy()); + Constant *offset = ConstantInt::getSigned(builder.getInt32Ty(), offsetof(jl_tls_states_t, gc_state)); + Value *gc_state = builder.CreateInBoundsGEP(T_int8, ptls, ArrayRef<Value*>(offset), "gc_state"); + if (old_state == nullptr) { + old_state = builder.CreateLoad(T_int8, gc_state); + cast<LoadInst>(old_state)->setOrdering(AtomicOrdering::Monotonic); + } + builder.CreateAlignedStore(state, gc_state, Align(sizeof(void*)))->setOrdering(AtomicOrdering::Release); + if (auto *C = dyn_cast<ConstantInt>(old_state)) + if (C->isZero()) + return old_state; + if (auto *C = dyn_cast<ConstantInt>(state)) + if (!C->isZero()) + return old_state; + BasicBlock *passBB = BasicBlock::Create(builder.getContext(), "safepoint", builder.GetInsertBlock()->getParent()); + BasicBlock *exitBB = BasicBlock::Create(builder.getContext(), "after_safepoint", builder.GetInsertBlock()->getParent()); + Constant *zero8 = ConstantInt::get(T_int8, 0); + builder.CreateCondBr(builder.CreateAnd(builder.CreateICmpNE(old_state, zero8), // if (old_state && !state) + builder.CreateICmpEQ(state, zero8)), + passBB, exitBB); + builder.SetInsertPoint(passBB); + MDNode *tbaa = get_tbaa_const(builder.getContext()); + emit_gc_safepoint(builder, ptls, tbaa); + builder.CreateBr(exitBB); + builder.SetInsertPoint(exitBB); + return old_state; +} + +static inline llvm::Value *emit_gc_unsafe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls) +{ + using namespace llvm; + Value *state = builder.getInt8(0); + return emit_gc_state_set(builder, ptls, state, nullptr); +} + +static inline llvm::Value *emit_gc_unsafe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state) +{ + using namespace llvm; + Value *old_state = builder.getInt8(0); + return 
emit_gc_state_set(builder, ptls, state, old_state); +} + +static inline llvm::Value *emit_gc_safe_enter(llvm::IRBuilder<> &builder, llvm::Value *ptls) +{ + using namespace llvm; + Value *state = builder.getInt8(JL_GC_STATE_SAFE); + return emit_gc_state_set(builder, ptls, state, nullptr); +} + +static inline llvm::Value *emit_gc_safe_leave(llvm::IRBuilder<> &builder, llvm::Value *ptls, llvm::Value *state) +{ + using namespace llvm; + Value *old_state = builder.getInt8(JL_GC_STATE_SAFE); + return emit_gc_state_set(builder, ptls, state, old_state); } // Compatibility shims for LLVM attribute APIs that were renamed in LLVM 14. @@ -327,5 +429,4 @@ inline Attribute getAttributeAtIndex(const AttributeList &L, unsigned Index, Att return L.getAttribute(Index, Kind); #endif } - } diff --git a/src/gc-alloc-profiler.cpp b/src/gc-alloc-profiler.cpp index 818d6e803c9df..1bcbeb2189f5f 100644 --- a/src/gc-alloc-profiler.cpp +++ b/src/gc-alloc-profiler.cpp @@ -80,7 +80,8 @@ extern "C" { // Needed since these functions doesn't take any arguments. JL_DLLEXPORT void jl_start_alloc_profile(double sample_rate) { // We only need to do this once, the first time this is called. - while (g_alloc_profile.per_thread_profiles.size() < (size_t)jl_n_threads) { + size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); + while (g_alloc_profile.per_thread_profiles.size() < nthreads) { g_alloc_profile.per_thread_profiles.push_back(jl_per_thread_alloc_profile_t{}); } @@ -131,7 +132,10 @@ JL_DLLEXPORT void jl_free_alloc_profile() { void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *type) JL_NOTSAFEPOINT { auto& global_profile = g_alloc_profile; - auto thread_id = jl_atomic_load_relaxed(&jl_current_task->tid); + size_t thread_id = jl_atomic_load_relaxed(&jl_current_task->tid); + if (thread_id >= global_profile.per_thread_profiles.size()) + return; // ignore allocations on threads started after the alloc-profile started + auto& profile = global_profile.per_thread_profiles[thread_id]; auto sample_val = double(rand()) / double(RAND_MAX); diff --git a/src/gc-debug.c b/src/gc-debug.c index 040502c805448..aa9dd5abda01b 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -99,7 +99,7 @@ static arraylist_t bits_save[4]; static void gc_clear_mark_page(jl_gc_pagemeta_t *pg, int bits) { - jl_ptls_t ptls2 = jl_all_tls_states[pg->thread_n]; + jl_ptls_t ptls2 = gc_all_tls_states[pg->thread_n]; jl_gc_pool_t *pool = &ptls2->heap.norm_pools[pg->pool_n]; jl_taggedvalue_t *pv = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); char *lim = (char*)pv + GC_PAGE_SZ - GC_PAGE_OFFSET - pool->osize; @@ -164,8 +164,8 @@ static void clear_mark(int bits) } } bigval_t *v; - for (int i = 0;i < jl_n_threads;i++) { - v = jl_all_tls_states[i]->heap.big_objects; + for (int i = 0; i < gc_n_threads; i++) { + v = gc_all_tls_states[i]->heap.big_objects; while (v != NULL) { void *gcv = &v->header; if (!gc_verifying) @@ -207,8 +207,8 @@ static void gc_verify_track(jl_ptls_t ptls) clear_mark(GC_CLEAN); gc_mark_queue_all_roots(ptls, &sp); gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); @@ -256,8 +256,8 @@ void gc_verify(jl_ptls_t ptls) gc_verifying = 1; gc_mark_queue_all_roots(ptls, &sp); gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0); - 
for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0); @@ -297,7 +297,7 @@ static void gc_verify_tags_page(jl_gc_pagemeta_t *pg) // for all pages in use int p_n = pg->pool_n; int t_n = pg->thread_n; - jl_ptls_t ptls2 = jl_all_tls_states[t_n]; + jl_ptls_t ptls2 = gc_all_tls_states[t_n]; jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; int osize = pg->osize; char *data = pg->data; @@ -401,8 +401,8 @@ static void gc_verify_tags_pagetable(void) void gc_verify_tags(void) { // verify the freelist chains look valid - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; for (int i = 0; i < JL_GC_N_POOLS; i++) { // for all pools, iterate its freelist jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; @@ -467,7 +467,7 @@ static void gc_debug_alloc_init(jl_alloc_num_t *num, const char *name) return; if (*env == 'r') { env++; - for (int i = 0;i < 3;i++) { + for (int i = 0; i < 3; i++) { while (num->random[i] == 0) { num->random[i] = jl_rand(); } @@ -577,7 +577,7 @@ static void gc_scrub_task(jl_task_t *ta) jl_ptls_t ptls = jl_current_task->ptls; jl_ptls_t ptls2 = NULL; if (tid != -1) - ptls2 = jl_all_tls_states[tid]; + ptls2 = gc_all_tls_states[tid]; char *low; char *high; @@ -946,8 +946,8 @@ void gc_time_mark_pause(int64_t t0, int64_t scanned_bytes, { int64_t last_remset_len = 0; int64_t remset_nptr = 0; - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; last_remset_len += ptls2->heap.last_remset->len; remset_nptr = ptls2->heap.remset_nptr; } @@ -1023,7 +1023,7 @@ void jl_gc_debug_init(void) #endif #ifdef OBJPROFILE - for (int g = 0;g < 3;g++) { + for (int g = 0; g < 3; g++) { htable_new(&obj_counts[g], 0); htable_new(&obj_sizes[g], 0); } @@ -1085,8 +1085,8 @@ void gc_stats_all_pool(void) { size_t nb=0, w, tw=0, no=0, tp=0, nold=0, noldbytes=0, np, nol; for (int i = 0; i < JL_GC_N_POOLS; i++) { - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; size_t b = pool_stats(&ptls2->heap.norm_pools[i], &w, &np, &nol); nb += b; no += (b / ptls2->heap.norm_pools[i].osize); @@ -1110,8 +1110,8 @@ void gc_stats_all_pool(void) void gc_stats_big_obj(void) { size_t nused=0, nbytes=0, nused_old=0, nbytes_old=0; - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; bigval_t *v = ptls2->heap.big_objects; while (v != NULL) { if (gc_marked(v->bits.gc)) { @@ -1219,7 +1219,7 @@ void gc_count_pool(void) empty_pages = 0; gc_count_pool_pagetable(); jl_safe_printf("****** Pool stat: ******\n"); - for (int i = 0;i < 4;i++) + for (int i = 0; i < 4; i++) jl_safe_printf("bits(%d): %" PRId64 "\n", i, poolobj_sizes[i]); // empty_pages is inaccurate after the sweep since young objects are // also GC_CLEAN diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 8617e773efc67..40292cf472037 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -194,8 +194,9 @@ void sweep_stack_pools(void) // 
bufsz = t->bufsz // if (stkbuf) // push(free_stacks[sz], stkbuf) - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; // free half of stacks that remain unused since last sweep for (int p = 0; p < JL_N_STACK_POOLS; p++) { diff --git a/src/gc.c b/src/gc.c index b2ec0fdb75597..8df7c7b242570 100644 --- a/src/gc.c +++ b/src/gc.c @@ -171,6 +171,8 @@ static _Atomic(int) support_conservative_marking = 0; jl_gc_num_t gc_num = {0}; static size_t last_long_collect_interval; +int gc_n_threads; +jl_ptls_t* gc_all_tls_states; pagetable_t memory_map; @@ -193,12 +195,15 @@ NOINLINE uintptr_t gc_get_stack_ptr(void) #define should_timeout() 0 -static void jl_gc_wait_for_the_world(void) +void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) { - if (jl_n_threads > 1) + assert(gc_n_threads); + if (gc_n_threads > 1) jl_wake_libuv(); - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; // This acquire load pairs with the release stores // in the signal handler of safepoint so we are sure that // all the stores on those threads are visible. @@ -210,6 +215,9 @@ static void jl_gc_wait_for_the_world(void) } } + +void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads); + // malloc wrappers, aligned allocation #if defined(_OS_WINDOWS_) @@ -272,8 +280,8 @@ static void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT static void run_finalizer(jl_task_t *ct, jl_value_t *o, jl_value_t *ff) { - if (gc_ptr_tag(o, 1)) { - ((void (*)(void*))ff)(gc_ptr_clear_tag(o, 1)); + if (gc_ptr_tag(o, 3)) { + ((void (*)(void*))ff)(gc_ptr_clear_tag(o, 3)); return; } jl_value_t *args[2] = {ff,o}; @@ -414,7 +422,10 @@ static void run_finalizers(jl_task_t *ct) jl_rng_split(ct->rngState, finalizer_rngState); // This releases the finalizers lock. 
+ int8_t was_in_finalizer = ct->ptls->in_finalizer; + ct->ptls->in_finalizer = 1; jl_gc_run_finalizers_in_list(ct, &copied_list); + ct->ptls->in_finalizer = was_in_finalizer; arraylist_free(&copied_list); memcpy(&ct->rngState[0], &save_rngState[0], sizeof(save_rngState)); @@ -426,9 +437,7 @@ JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) ct = jl_current_task; jl_ptls_t ptls = ct->ptls; if (!ptls->in_finalizer && ptls->locks.len == 0 && ptls->finalizers_inhibited == 0) { - ptls->in_finalizer = 1; run_finalizers(ct); - ptls->in_finalizer = 0; } } @@ -499,13 +508,18 @@ static void schedule_all_finalizers(arraylist_t *flist) JL_NOTSAFEPOINT void jl_gc_run_all_finalizers(jl_task_t *ct) { + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); schedule_all_finalizers(&finalizer_list_marked); - // This could be run before we had a chance to setup all threads - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2) schedule_all_finalizers(&ptls2->finalizers); } + gc_n_threads = 0; + gc_all_tls_states = NULL; run_finalizers(ct); } @@ -542,6 +556,13 @@ JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 1), f); } +// schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads) +JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT +{ + assert(!gc_ptr_tag(v, 3)); + jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 3), f); +} + JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT { if (__unlikely(jl_typeis(f, jl_voidpointer_type))) { @@ -562,11 +583,18 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) arraylist_new(&copied_list, 0); // No need to check the to_finalize list since the user is apparently // still holding a reference to the object - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; - finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2) + finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); } finalize_object(&finalizer_list_marked, o, &copied_list, 0); + gc_n_threads = 0; + gc_all_tls_states = NULL; if (copied_list.len > 0) { // This releases the finalizers lock. jl_gc_run_finalizers_in_list(ct, &copied_list); @@ -598,9 +626,11 @@ static void gc_sweep_foreign_objs_in_list(arraylist_t *objs) static void gc_sweep_foreign_objs(void) { - for (int i = 0;i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; - gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2) + gc_sweep_foreign_objs_in_list(&ptls2->sweep_objs); } } @@ -611,18 +641,19 @@ static int64_t last_gc_total_bytes = 0; // under this limit, but we will go above it rather than halting. 
#ifdef _P64 typedef uint64_t memsize_t; -#define default_collect_interval (5600*1024*sizeof(void*)) -static size_t max_collect_interval = 1250000000UL; -// Eventually we can expose this to the user/ci. -memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; +static const size_t default_collect_interval = 5600 * 1024 * sizeof(void*); +static const size_t max_collect_interval = 1250000000UL; +static size_t total_mem; +// We expose this to the user/ci as jl_gc_set_max_memory +static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; #else typedef uint32_t memsize_t; -#define default_collect_interval (3200*1024*sizeof(void*)) -static size_t max_collect_interval = 500000000UL; +static const size_t default_collect_interval = 3200 * 1024 * sizeof(void*); +static const size_t max_collect_interval = 500000000UL; // Work really hard to stay within 2GB // Alternative is to risk running out of address space // on 32 bit architectures. -memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024; +static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024; #endif // global variables for GC stats @@ -728,9 +759,11 @@ static void gc_sync_cache(jl_ptls_t ptls) JL_NOTSAFEPOINT // No other threads can be running marking at the same time static void gc_sync_all_caches_nolock(jl_ptls_t ptls) { - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; - gc_sync_cache_nolock(ptls, &ptls2->gc_cache); + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2) + gc_sync_cache_nolock(ptls, &ptls2->gc_cache); } } @@ -937,8 +970,11 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, static void clear_weak_refs(void) { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; size_t n, l = ptls2->heap.weak_refs.len; void **lst = ptls2->heap.weak_refs.items; for (n = 0; n < l; n++) { @@ -951,8 +987,11 @@ static void clear_weak_refs(void) static void sweep_weak_refs(void) { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; size_t n = 0; size_t ndel = 0; size_t l = ptls2->heap.weak_refs.len; @@ -1069,11 +1108,16 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT { gc_time_big_start(); - for (int i = 0;i < jl_n_threads;i++) - sweep_big_list(sweep_full, &jl_all_tls_states[i]->heap.big_objects); + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; + sweep_big_list(sweep_full, &ptls2->heap.big_objects); + } if (sweep_full) { bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked); - // Move all survivors from big_objects_marked list to big_objects list. + // Move all survivors from big_objects_marked list to the big_objects list of this thread. 
if (ptls->heap.big_objects) ptls->heap.big_objects->prev = last_next; *last_next = ptls->heap.big_objects; @@ -1112,8 +1156,12 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls = jl_all_tls_states[i]; + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval); dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed); @@ -1128,8 +1176,12 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT { - for (int i = 0; i < jl_n_threads; i++) { - jl_ptls_t ptls = jl_all_tls_states[i]; + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); @@ -1176,8 +1228,11 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT { gc_time_mallocd_array_start(); - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; mallocarray_t *ma = ptls2->heap.mallocarrays; mallocarray_t **pma = &ptls2->heap.mallocarrays; while (ma != NULL) { @@ -1201,11 +1256,10 @@ static void sweep_malloced_arrays(void) JL_NOTSAFEPOINT } // pool allocation -static inline jl_taggedvalue_t *reset_page(const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT +static inline jl_taggedvalue_t *reset_page(jl_ptls_t ptls2, const jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_taggedvalue_t *fl) JL_NOTSAFEPOINT { assert(GC_PAGE_OFFSET >= sizeof(void*)); pg->nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / p->osize; - jl_ptls_t ptls2 = jl_all_tls_states[pg->thread_n]; pg->pool_n = p - ptls2->heap.norm_pools; memset(pg->ages, 0, GC_PAGE_SZ / 8 / p->osize + 1); jl_taggedvalue_t *beg = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); @@ -1243,7 +1297,7 @@ static NOINLINE jl_taggedvalue_t *add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT pg->osize = p->osize; pg->ages = (uint8_t*)malloc_s(GC_PAGE_SZ / 8 / p->osize + 1); pg->thread_n = ptls->tid; - jl_taggedvalue_t *fl = reset_page(p, pg, NULL); + jl_taggedvalue_t *fl = reset_page(ptls, p, pg, NULL); p->newpages = fl; return fl; } @@ -1357,7 +1411,8 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t // FIXME - need to do accounting on a per-thread basis // on quick sweeps, keep a few pages empty but allocated for performance if (!sweep_full && lazy_freed_pages <= default_collect_interval / GC_PAGE_SZ) { - jl_taggedvalue_t *begin = reset_page(p, pg, p->newpages); + jl_ptls_t ptls2 = gc_all_tls_states[pg->thread_n]; + jl_taggedvalue_t *begin = reset_page(ptls2, p, pg, p->newpages); p->newpages = begin; begin->next = (jl_taggedvalue_t*)0; lazy_freed_pages++; @@ -1459,7 +1514,7 @@ static inline void 
sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t *pg { int p_n = pg->pool_n; int t_n = pg->thread_n; - jl_ptls_t ptls2 = jl_all_tls_states[t_n]; + jl_ptls_t ptls2 = gc_all_tls_states[t_n]; jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n]; int osize = pg->osize; pfl[t_n * JL_GC_N_POOLS + p_n] = sweep_page(p, pg, pfl[t_n * JL_GC_N_POOLS + p_n], sweep_full, osize); @@ -1571,9 +1626,9 @@ static void gc_sweep_pool(int sweep_full) gc_time_pool_start(); lazy_freed_pages = 0; - // For the benfit of the analyzer, which doesn't know that jl_n_threads + // For the benefit of the analyzer, which doesn't know that gc_n_threads // doesn't change over the course of this function - size_t n_threads = jl_n_threads; + size_t n_threads = gc_n_threads; // allocate enough space to hold the end of the free list chain // for every thread and pool size @@ -1582,7 +1637,13 @@ static void gc_sweep_pool(int sweep_full) // update metadata of pages that were pointed to by freelist or newpages from a pool // i.e. pages being the current allocation target for (int t_i = 0; t_i < n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) { + for (int i = 0; i < JL_GC_N_POOLS; i++) { + pfl[t_i * JL_GC_N_POOLS + i] = NULL; + } + continue; + } for (int i = 0; i < JL_GC_N_POOLS; i++) { jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; jl_taggedvalue_t *last = p->freelist; @@ -1611,6 +1672,9 @@ static void gc_sweep_pool(int sweep_full) // null out terminal pointers of free lists for (int t_i = 0; t_i < n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; for (int i = 0; i < JL_GC_N_POOLS; i++) { *pfl[t_i * JL_GC_N_POOLS + i] = NULL; } @@ -2378,10 +2442,13 @@ stack: { } else { new_obj = (jl_value_t*)gc_read_stack(&rts[i], offset, lb, ub); - if (gc_ptr_tag(new_obj, 1)) { + if (gc_ptr_tag(new_obj, 3)) { // handle tagged pointers in finalizer list new_obj = gc_ptr_clear_tag(new_obj, 1); + // skip over the finalizer fptr i++; + if (gc_ptr_tag(new_obj, 2)) + continue; } } if (!gc_try_setmark(new_obj, &nptr, &tag, &bits)) @@ -2574,6 +2641,8 @@ finlist: { new_obj = *begin; if (__unlikely(!new_obj)) continue; + if (gc_ptr_tag(new_obj, 2)) + continue; if (gc_ptr_tag(new_obj, 1)) { new_obj = (jl_value_t*)gc_ptr_clear_tag(new_obj, 1); begin++; @@ -2756,7 +2825,7 @@ mark: { int16_t tid = jl_atomic_load_relaxed(&ta->tid); gc_invoke_callbacks(jl_gc_cb_task_scanner_t, gc_cblist_task_scanner, - (ta, tid != -1 && ta == jl_all_tls_states[tid]->root_task)); + (ta, tid != -1 && ta == gc_all_tls_states[tid]->root_task)); import_gc_state(ptls, &sp); } #ifdef COPY_STACKS @@ -2778,7 +2847,7 @@ mark: { if (stkbuf && ta->copy_stack && ta->ptls == NULL) { int16_t tid = jl_atomic_load_relaxed(&ta->tid); assert(tid >= 0); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = gc_all_tls_states[tid]; ub = (uintptr_t)ptls2->stackbase; lb = ub - ta->copy_stack; offset = (uintptr_t)stkbuf - lb; @@ -2891,19 +2960,26 @@ mark: { static void jl_gc_queue_thread_local(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) { - jl_value_t *current_task = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task); - gc_mark_queue_obj(gc_cache, sp, current_task); - gc_heap_snapshot_record_root(current_task, "current task"); - gc_mark_queue_obj(gc_cache, sp, ptls2->root_task); - - gc_heap_snapshot_record_root((jl_value_t*)ptls2->root_task, "root task"); - if (ptls2->next_task) { - gc_mark_queue_obj(gc_cache, sp, ptls2->next_task); - 
gc_heap_snapshot_record_root((jl_value_t*)ptls2->next_task, "next task"); + jl_task_t *task; + task = ptls2->root_task; + if (task) { + gc_mark_queue_obj(gc_cache, sp, task); + gc_heap_snapshot_record_root((jl_value_t*)task, "root task"); + } + task = jl_atomic_load_relaxed(&ptls2->current_task); + if (task) { + gc_mark_queue_obj(gc_cache, sp, task); + gc_heap_snapshot_record_root((jl_value_t*)task, "current task"); } - if (ptls2->previous_task) { // shouldn't be necessary, but no reason not to - gc_mark_queue_obj(gc_cache, sp, ptls2->previous_task); - gc_heap_snapshot_record_root((jl_value_t*)ptls2->previous_task, "previous task"); + task = ptls2->next_task; + if (task) { + gc_mark_queue_obj(gc_cache, sp, task); + gc_heap_snapshot_record_root((jl_value_t*)task, "next task"); + } + task = ptls2->previous_task; + if (task) { // shouldn't be necessary, but no reason not to + gc_mark_queue_obj(gc_cache, sp, task); + gc_heap_snapshot_record_root((jl_value_t*)task, "previous task"); } if (ptls2->previous_exception) { gc_mark_queue_obj(gc_cache, sp, ptls2->previous_exception); @@ -2960,17 +3036,25 @@ static void sweep_finalizer_list(arraylist_t *list) size_t j = 0; for (size_t i=0; i < len; i+=2) { void *v0 = items[i]; - void *v = gc_ptr_clear_tag(v0, 1); + void *v = gc_ptr_clear_tag(v0, 3); if (__unlikely(!v0)) { // remove from this list continue; } void *fin = items[i+1]; - int isfreed = !gc_marked(jl_astaggedvalue(v)->bits.gc); - int isold = (list != &finalizer_list_marked && + int isfreed; + int isold; + if (gc_ptr_tag(v, 2)) { + isfreed = 1; + isold = 0; + } + else { + isfreed = !gc_marked(jl_astaggedvalue(v)->bits.gc); + isold = (list != &finalizer_list_marked && jl_astaggedvalue(v)->bits.gc == GC_OLD_MARKED && jl_astaggedvalue(fin)->bits.gc == GC_OLD_MARKED); + } if (isfreed || isold) { // remove from this list } @@ -3153,11 +3237,18 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) uint64_t start_mark_time = jl_hrtime(); // 1. fix GC bits of objects in the remset. - for (int t_i = 0; t_i < jl_n_threads; t_i++) - jl_gc_premark(jl_all_tls_states[t_i]); + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 != NULL) + jl_gc_premark(ptls2); + } - for (int t_i = 0; t_i < jl_n_threads; t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; // 2.1. mark every object in the `last_remsets` and `rem_binding` jl_gc_queue_remset(gc_cache, &sp, ptls2); // 2.2. 
mark every thread local root @@ -3194,16 +3285,22 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // mark the object moved to the marked list from the // `finalizer_list` by `sweep_finalizer_list` size_t orig_marked_len = finalizer_list_marked.len; - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; sweep_finalizer_list(&ptls2->finalizers); } if (prev_sweep_full) { sweep_finalizer_list(&finalizer_list_marked); orig_marked_len = 0; } - for (int i = 0;i < jl_n_threads;i++) { - jl_ptls_t ptls2 = jl_all_tls_states[i]; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0); } gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, orig_marked_len); @@ -3242,8 +3339,13 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // 5. next collection decision int not_freed_enough = (collection == JL_GC_AUTO) && estimate_freed < (7*(actual_allocd/10)); int nptr = 0; - for (int i = 0;i < jl_n_threads;i++) - nptr += jl_all_tls_states[i]->heap.remset_nptr; + assert(gc_n_threads); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 == NULL) + continue; + nptr += ptls2->heap.remset_nptr; + } // many pointers in the intergen frontier => "quick" mark is not quick int large_frontier = nptr*sizeof(void*) >= default_collect_interval; @@ -3258,9 +3360,16 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (large_frontier) { sweep_full = 1; } - if (gc_num.interval > max_collect_interval) { + size_t maxmem = 0; +#ifdef _P64 + // on a big memory machine, increase max_collect_interval to totalmem / nthreads / 2 + maxmem = total_mem / gc_n_threads / 2; +#endif + if (maxmem < max_collect_interval) + maxmem = max_collect_interval; + if (gc_num.interval > maxmem) { sweep_full = 1; - gc_num.interval = max_collect_interval; + gc_num.interval = maxmem; } } @@ -3308,8 +3417,11 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // sweeping is over // 6. if it is a quick sweep, put back the remembered objects in queued state // so that we don't trigger the barrier again on them. - for (int t_i = 0;t_i < jl_n_threads;t_i++) { - jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + assert(gc_n_threads); + for (int t_i = 0; t_i < gc_n_threads; t_i++) { + jl_ptls_t ptls2 = gc_all_tls_states[t_i]; + if (ptls2 == NULL) + continue; if (!sweep_full) { for (int i = 0; i < ptls2->heap.remset->len; i++) { jl_astaggedvalue(ptls2->heap.remset->items[i])->bits.gc = GC_MARKED; @@ -3421,9 +3533,17 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) #endif // Now we are ready to wait for other threads to hit the safepoint, // we can do a few things that doesn't require synchronization. - // TODO (concurrently queue objects) - // no-op for non-threading - jl_gc_wait_for_the_world(); + // + // We must sync here with the tls_lock operations, so that we have a + // seq-cst order between these events; now we know that either the new + // thread must run into our safepoint flag or we must observe the + // existence of the thread in the jl_n_threads count.
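/*
 * A minimal sketch of the ordering argument above (illustrative only, not
 * part of the patch): register_thread condenses what jl_init_threadtls does
 * under tls_lock, and begin_collection condenses the loads that follow. The
 * two globals stand in for jl_n_threads and jl_all_tls_states.
 */
static _Atomic(int) n_threads;
static _Atomic(jl_ptls_t*) all_states;

static void register_thread(jl_ptls_t ptls, int tid)
{
    jl_atomic_load_relaxed(&all_states)[tid] = ptls; // publish the ptls first
    jl_atomic_store_release(&n_threads, tid + 1);    // then make it countable
    jl_fence(); // seq-cst: from here on we must notice an armed safepoint
}

static void begin_collection(void)
{
    jl_fence(); // seq-cst: pairs with the fence in register_thread
    int gc_n = jl_atomic_load_acquire(&n_threads);
    jl_ptls_t *gc_states = jl_atomic_load_relaxed(&all_states);
    // every thread is either covered by gc_n (and will be waited for), or it
    // registered after our fence and will block on the safepoint flag
    (void)gc_n; (void)gc_states;
}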
+ // + // TODO: concurrently queue objects + jl_fence(); + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + jl_gc_wait_for_the_world(gc_all_tls_states, gc_n_threads); JL_PROBE_GC_STOP_THE_WORLD(); uint64_t t1 = jl_hrtime(); @@ -3446,7 +3566,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) JL_UNLOCK_NOGC(&finalizers_lock); } - // no-op for non-threading + gc_n_threads = 0; + gc_all_tls_states = NULL; jl_safepoint_end_gc(); jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); JL_PROBE_GC_END(); @@ -3455,10 +3576,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) // Doing this on all threads is racy (it's impossible to check // or wait for finalizers on other threads without dead lock). if (!ptls->finalizers_inhibited && ptls->locks.len == 0) { - int8_t was_in_finalizer = ptls->in_finalizer; - ptls->in_finalizer = 1; run_finalizers(ct); - ptls->in_finalizer = was_in_finalizer; } JL_PROBE_GC_FINALIZER(); @@ -3473,8 +3591,12 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_mark_sp_t *sp) { jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; - for (size_t i = 0; i < jl_n_threads; i++) - jl_gc_queue_thread_local(gc_cache, sp, jl_all_tls_states[i]); + assert(gc_n_threads); + for (size_t i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2) + jl_gc_queue_thread_local(gc_cache, sp, ptls2); + } mark_roots(gc_cache, sp); } @@ -3488,8 +3610,6 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { - if (ptls->tid == 0) - ptls->disable_gc = 1; jl_thread_heap_t *heap = &ptls->heap; jl_gc_pool_t *p = heap->norm_pools; for (int i = 0; i < JL_GC_N_POOLS; i++) { @@ -3520,7 +3640,6 @@ void jl_init_thread_heap(jl_ptls_t ptls) gc_cache->data_stack = (jl_gc_mark_data_t *)malloc_s(init_size * sizeof(jl_gc_mark_data_t)); memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); - assert(gc_num.interval == default_collect_interval); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); } @@ -3545,14 +3664,10 @@ void jl_gc_init(void) gc_num.max_memory = 0; #ifdef _P64 - // on a big memory machine, set max_collect_interval to totalmem / nthreads / 2 - uint64_t total_mem = uv_get_total_memory(); + total_mem = uv_get_total_memory(); uint64_t constrained_mem = uv_get_constrained_memory(); if (constrained_mem != 0) total_mem = constrained_mem; - size_t maxmem = total_mem / jl_n_threads / 2; - if (maxmem > max_collect_interval) - max_collect_interval = maxmem; #endif // We allocate with abandon until we get close to the free memory on the machine. 
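/*
 * The access pattern this patch applies throughout gc.c, shown once as a
 * standalone helper (hypothetical, not a function in the patch): snapshot
 * the count with an acquire load and the array with a relaxed load, then
 * skip NULL slots, which belong to threads that are mid-adoption or already
 * torn down. (When jl_init_threadtls grows the array, the old copy is
 * retired with jl_gc_add_quiescent rather than free(), so concurrent
 * relaxed readers stay safe.)
 */
static int count_registered_threads(void)
{
    int gc_n = jl_atomic_load_acquire(&jl_n_threads);
    jl_ptls_t *states = jl_atomic_load_relaxed(&jl_all_tls_states);
    int live = 0;
    for (int i = 0; i < gc_n; i++) {
        jl_ptls_t ptls2 = states[i];
        if (ptls2 == NULL) // slot exists but no usable thread state
            continue;
        live++;
    }
    return live;
}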
@@ -4007,7 +4122,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) goto valid_object; } jl_gc_pool_t *pool = - jl_all_tls_states[meta->thread_n]->heap.norm_pools + + gc_all_tls_states[meta->thread_n]->heap.norm_pools + meta->pool_n; if (meta->fl_begin_offset == (uint16_t) -1) { // case 2: this is a page on the newpages list diff --git a/src/gc.h b/src/gc.h index d0a5a5375cb97..7b02df69abbc1 100644 --- a/src/gc.h +++ b/src/gc.h @@ -394,6 +394,8 @@ extern bigval_t *big_objects_marked; extern arraylist_t finalizer_list_marked; extern arraylist_t to_finalize; extern int64_t lazy_freed_pages; +extern int gc_n_threads; +extern jl_ptls_t* gc_all_tls_states; STATIC_INLINE bigval_t *bigval_header(jl_taggedvalue_t *o) JL_NOTSAFEPOINT { diff --git a/src/gf.c b/src/gf.c index abfa0713b9508..4de51e19a88ad 100644 --- a/src/gf.c +++ b/src/gf.c @@ -2070,8 +2070,7 @@ static void record_precompile_statement(jl_method_instance_t *mi) if (!jl_is_method(def)) return; - if (jl_n_threads > 1) - JL_LOCK(&precomp_statement_out_lock); + JL_LOCK(&precomp_statement_out_lock); if (s_precompile == NULL) { const char *t = jl_options.trace_compile; if (!strncmp(t, "stderr", 6)) { @@ -2090,8 +2089,7 @@ static void record_precompile_statement(jl_method_instance_t *mi) if (s_precompile != JL_STDERR) ios_flush(&f_precompile); } - if (jl_n_threads > 1) - JL_UNLOCK(&precomp_statement_out_lock); + JL_UNLOCK(&precomp_statement_out_lock); } jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t world) diff --git a/src/init.c b/src/init.c index b42aea8bd4883..25ad798d040fe 100644 --- a/src/init.c +++ b/src/init.c @@ -201,14 +201,18 @@ void jl_task_frame_noreturn(jl_task_t *ct); JL_DLLEXPORT void jl_atexit_hook(int exitcode) { - if (jl_all_tls_states == NULL) + if (jl_atomic_load_relaxed(&jl_all_tls_states) == NULL) return; - jl_task_t *ct = jl_current_task; + jl_task_t *ct = jl_get_current_task(); // we are about to start tearing everything down, so lets try not to get // upset by the local mess of things when we run the user's _atexit hooks - jl_task_frame_noreturn(ct); + if (ct) + jl_task_frame_noreturn(ct); + + if (ct == NULL && jl_base_module) + ct = container_of(jl_adopt_thread(), jl_task_t, gcstack); if (exitcode == 0) jl_write_compiler_output(); @@ -217,10 +221,16 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) jl_write_coverage_data(jl_options.output_code_coverage); if (jl_options.malloc_log) jl_write_malloc_log(); + + int8_t old_state; + if (ct) + old_state = jl_gc_unsafe_enter(ct->ptls); + if (jl_base_module) { jl_value_t *f = jl_get_global(jl_base_module, jl_symbol("_atexit")); if (f != NULL) { JL_TRY { + assert(ct); size_t last_age = ct->world_age; ct->world_age = jl_get_world_counter(); jl_apply(&f, 1); @@ -240,7 +250,8 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) JL_STDOUT = (uv_stream_t*) STDOUT_FILENO; JL_STDERR = (uv_stream_t*) STDERR_FILENO; - jl_gc_run_all_finalizers(ct); + if (ct) + jl_gc_run_all_finalizers(ct); uv_loop_t *loop = jl_global_event_loop(); if (loop != NULL) { @@ -248,7 +259,7 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) JL_UV_LOCK(); uv_walk(loop, jl_uv_exitcleanup_walk, &queue); struct uv_shutdown_queue_item *item = queue.first; - if (ct != NULL) { + if (ct) { while (item) { JL_TRY { while (item) { @@ -289,11 +300,13 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) #endif jl_teardown_codegen(); + if (ct) + jl_gc_unsafe_leave(ct->ptls, old_state); } JL_DLLEXPORT void jl_postoutput_hook(void) { - if (jl_all_tls_states == NULL) + if 
(jl_atomic_load_relaxed(&jl_all_tls_states) == NULL) return; if (jl_base_module) { @@ -772,7 +785,7 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ if (jl_base_module == NULL) { // nthreads > 1 requires code in Base - jl_n_threads = 1; + jl_atomic_store_relaxed(&jl_n_threads, 1); } jl_start_threads(); diff --git a/src/jl_exported_data.inc b/src/jl_exported_data.inc index eae13a4ff285e..e426f2c1f8b42 100644 --- a/src/jl_exported_data.inc +++ b/src/jl_exported_data.inc @@ -130,7 +130,7 @@ // Data symbols that are defined inside the public libjulia #define JL_EXPORTED_DATA_SYMBOLS(XX) \ XX(jl_n_threadpools, int) \ - XX(jl_n_threads, int) \ + XX(jl_n_threads, _Atomic(int)) \ XX(jl_options, jl_options_t) \ // end of file diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index 37972c7908690..e2934824bad75 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -3,6 +3,7 @@ #define JL_RUNTIME_EXPORTED_FUNCS(XX) \ XX(jl_active_task_stack) \ XX(jl_add_standard_imports) \ + XX(jl_adopt_thread) \ XX(jl_alignment) \ XX(jl_alloc_array_1d) \ XX(jl_alloc_array_2d) \ @@ -150,6 +151,7 @@ XX(jl_gc_add_finalizer) \ XX(jl_gc_add_finalizer_th) \ XX(jl_gc_add_ptr_finalizer) \ + XX(jl_gc_add_quiescent) \ XX(jl_gc_allocobj) \ XX(jl_gc_alloc_0w) \ XX(jl_gc_alloc_1w) \ diff --git a/src/jlapi.c b/src/jlapi.c index d1fb1e5aacf25..53a6c9b3c6859 100644 --- a/src/jlapi.c +++ b/src/jlapi.c @@ -96,9 +96,15 @@ JL_DLLEXPORT void jl_init_with_image__threading(const char *julia_bindir, jl_init_with_image(julia_bindir, image_relative_path); } +static void _jl_exception_clear(jl_task_t *ct) JL_NOTSAFEPOINT +{ + ct->ptls->previous_exception = NULL; +} + JL_DLLEXPORT jl_value_t *jl_eval_string(const char *str) { jl_value_t *r; + jl_task_t *ct = jl_current_task; JL_TRY { const char filename[] = "none"; jl_value_t *ast = jl_parse_all(str, strlen(str), @@ -106,10 +112,10 @@ JL_DLLEXPORT jl_value_t *jl_eval_string(const char *str) JL_GC_PUSH1(&ast); r = jl_toplevel_eval_in(jl_main_module, ast); JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { - jl_current_task->ptls->previous_exception = jl_current_exception(); + ct->ptls->previous_exception = jl_current_exception(); r = NULL; } return r; @@ -128,7 +134,7 @@ JL_DLLEXPORT jl_value_t *jl_exception_occurred(void) JL_DLLEXPORT void jl_exception_clear(void) { - jl_current_task->ptls->previous_exception = NULL; + _jl_exception_clear(jl_current_task); } // get the name of a type as a string @@ -181,7 +187,7 @@ JL_DLLEXPORT jl_value_t *jl_call(jl_function_t *f, jl_value_t **args, uint32_t n v = jl_apply(argv, nargs); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -201,7 +207,7 @@ JL_DLLEXPORT jl_value_t *jl_call0(jl_function_t *f) v = jl_apply_generic(f, NULL, 0); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -224,7 +230,7 @@ JL_DLLEXPORT jl_value_t *jl_call1(jl_function_t *f, jl_value_t *a) v = jl_apply(argv, 2); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -248,7 +254,7 @@ JL_DLLEXPORT jl_value_t *jl_call2(jl_function_t *f, jl_value_t *a, jl_value_t *b v = jl_apply(argv, 3); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + 
_jl_exception_clear(ct); } JL_CATCH { ct->ptls->previous_exception = jl_current_exception(); @@ -261,6 +267,7 @@ JL_DLLEXPORT jl_value_t *jl_call3(jl_function_t *f, jl_value_t *a, jl_value_t *b, jl_value_t *c) { jl_value_t *v; + jl_task_t *ct = jl_current_task; JL_TRY { jl_value_t **argv; JL_GC_PUSHARGS(argv, 4); @@ -268,16 +275,15 @@ JL_DLLEXPORT jl_value_t *jl_call3(jl_function_t *f, jl_value_t *a, argv[1] = a; argv[2] = b; argv[3] = c; - jl_task_t *ct = jl_current_task; size_t last_age = ct->world_age; ct->world_age = jl_get_world_counter(); v = jl_apply(argv, 4); ct->world_age = last_age; JL_GC_POP(); - jl_exception_clear(); + _jl_exception_clear(ct); } JL_CATCH { - jl_current_task->ptls->previous_exception = jl_current_exception(); + ct->ptls->previous_exception = jl_current_exception(); v = NULL; } return v; @@ -560,8 +566,8 @@ static NOINLINE int true_main(int argc, char *argv[]) (jl_function_t*)jl_get_global(jl_base_module, jl_symbol("_start")) : NULL; if (start_client) { + jl_task_t *ct = jl_current_task; JL_TRY { - jl_task_t *ct = jl_current_task; size_t last_age = ct->world_age; ct->world_age = jl_get_world_counter(); jl_apply(&start_client, 1); diff --git a/src/julia.h b/src/julia.h index b0275f02df2e2..163eda6e40198 100644 --- a/src/julia.h +++ b/src/julia.h @@ -904,6 +904,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t); JL_DLLEXPORT void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_finalize(jl_value_t *o); JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value); JL_DLLEXPORT jl_value_t *jl_gc_alloc_0w(void); @@ -1674,7 +1675,7 @@ JL_DLLEXPORT jl_sym_t *jl_get_UNAME(void) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_sym_t *jl_get_ARCH(void) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_value_t *jl_get_libllvm(void) JL_NOTSAFEPOINT; extern JL_DLLIMPORT int jl_n_threadpools; -extern JL_DLLIMPORT int jl_n_threads; +extern JL_DLLIMPORT _Atomic(int) jl_n_threads; extern JL_DLLIMPORT int *jl_n_threads_per_pool; // environment entries @@ -1756,6 +1757,7 @@ JL_DLLEXPORT void jl_atexit_hook(int status); JL_DLLEXPORT void jl_postoutput_hook(void); JL_DLLEXPORT void JL_NORETURN jl_exit(int status); JL_DLLEXPORT const char *jl_pathname_for_handle(void *handle); +JL_DLLEXPORT jl_gcframe_t **jl_adopt_thread(void); JL_DLLEXPORT int jl_deserialize_verify_header(ios_t *s); JL_DLLEXPORT void jl_preload_sysimg_so(const char *fname); diff --git a/src/julia_gcext.h b/src/julia_gcext.h index 631d9c2910330..6523198474771 100644 --- a/src/julia_gcext.h +++ b/src/julia_gcext.h @@ -79,7 +79,7 @@ JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, // will result in the custom sweep function actually being called. // This must be done at most once per object and should usually be // done right after allocating the object. -JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t * bj); +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *bj); // The following functions enable support for conservative marking. 
This // functionality allows the user to determine if a machine word can be diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 8847a3e34be51..5011d900ef99e 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2706,12 +2706,12 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(std::vector<int> &Colors, State // Insert GC frame stores PlaceGCFrameStores(S, AllocaSlot - 2, Colors, gcframe); // Insert GCFrame pops - for(Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { - if (isa<ReturnInst>(I->getTerminator())) { + for (auto &BB : *F) { + if (isa<ReturnInst>(BB.getTerminator())) { auto popGcframe = CallInst::Create( getOrDeclare(jl_intrinsics::popGCFrame), {gcframe}); - popGcframe->insertBefore(I->getTerminator()); + popGcframe->insertBefore(BB.getTerminator()); } } } diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index f0c0c6ee77b44..3b55339984516 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -23,7 +23,7 @@ JuliaPassContext::JuliaPassContext() tbaa_gcframe(nullptr), tbaa_tag(nullptr), - pgcstack_getter(nullptr), gc_flush_func(nullptr), + pgcstack_getter(nullptr), adoptthread_func(nullptr), gc_flush_func(nullptr), gc_preserve_begin_func(nullptr), gc_preserve_end_func(nullptr), pointer_from_objref_func(nullptr), alloc_obj_func(nullptr), typeof_func(nullptr), write_barrier_func(nullptr), @@ -44,6 +44,7 @@ void JuliaPassContext::initFunctions(Module &M) tbaa_tag = tbaa_make_child_with_context(llvmctx, "jtbaa_tag", tbaa_data_scalar).first; pgcstack_getter = M.getFunction("julia.get_pgcstack"); + adoptthread_func = M.getFunction("julia.get_pgcstack_or_new"); gc_flush_func = M.getFunction("julia.gcroot_flush"); gc_preserve_begin_func = M.getFunction("llvm.julia.gc_preserve_begin"); gc_preserve_end_func = M.getFunction("llvm.julia.gc_preserve_end"); @@ -70,10 +71,13 @@ void JuliaPassContext::initAll(Module &M) llvm::CallInst *JuliaPassContext::getPGCstack(llvm::Function &F) const { - for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); - pgcstack_getter && I != E; ++I) { - if (CallInst *callInst = dyn_cast<CallInst>(&*I)) { - if (callInst->getCalledOperand() == pgcstack_getter) { + if (!pgcstack_getter && !adoptthread_func) + return nullptr; + for (auto &I : F.getEntryBlock()) { + if (CallInst *callInst = dyn_cast<CallInst>(&I)) { + Value *callee = callInst->getCalledOperand(); + if ((pgcstack_getter && callee == pgcstack_getter) || + (adoptthread_func && callee == adoptthread_func)) { return callInst; } } diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index 64d5dc00e2c5b..68f6efe42be6d 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -50,6 +50,7 @@ struct JuliaPassContext { // Intrinsics.
llvm::Function *pgcstack_getter; + llvm::Function *adoptthread_func; llvm::Function *gc_flush_func; llvm::Function *gc_preserve_begin_func; llvm::Function *gc_preserve_end_func; diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp index e948e1c1a10bc..ad516d4ceb010 100644 --- a/src/llvm-ptls.cpp +++ b/src/llvm-ptls.cpp @@ -35,19 +35,19 @@ typedef Instruction TerminatorInst; namespace { struct LowerPTLS { - LowerPTLS(bool imaging_mode=false) - : imaging_mode(imaging_mode) + LowerPTLS(Module &M, bool imaging_mode=false) + : imaging_mode(imaging_mode), M(&M) {} - bool runOnModule(Module &M, bool *CFGModified); + bool run(bool *CFGModified); private: const bool imaging_mode; Module *M; - Function *pgcstack_getter; - MDNode *tbaa_const; - FunctionType *FT_pgcstack_getter; - PointerType *T_pgcstack_getter; - PointerType *T_pppjlvalue; + MDNode *tbaa_const{nullptr}; + MDNode *tbaa_gcframe{nullptr}; + FunctionType *FT_pgcstack_getter{nullptr}; + PointerType *T_pgcstack_getter{nullptr}; + PointerType *T_pppjlvalue{nullptr}; GlobalVariable *pgcstack_func_slot{nullptr}; GlobalVariable *pgcstack_key_slot{nullptr}; GlobalVariable *pgcstack_offset{nullptr}; @@ -55,7 +55,7 @@ struct LowerPTLS { Instruction *emit_pgcstack_tp(Value *offset, Instruction *insertBefore) const; template<typename T> T *add_comdat(T *G) const; GlobalVariable *create_aliased_global(Type *T, StringRef name) const; - void fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified); + void fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, bool or_new, bool *CFGModified); }; void LowerPTLS::set_pgcstack_attrs(CallInst *pgcstack) const @@ -159,19 +159,77 @@ inline T *LowerPTLS::add_comdat(T *G) const return G; } -void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified) +void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, Function *pgcstack_getter, bool or_new, bool *CFGModified) { if (pgcstack->use_empty()) { pgcstack->eraseFromParent(); return; } + if (or_new) { + // pgcstack(); + // if (pgcstack != nullptr) + // last_gc_state = emit_gc_unsafe_enter(ctx); + // phi = pgcstack; // fast + // else + // last_gc_state = gc_safe; + // phi = adopt(); // slow + // use phi; + // if (!retboxed) + // foreach(retinst) + // emit_gc_unsafe_leave(ctx, last_gc_state); + auto phi = PHINode::Create(pgcstack->getType(), 2, ""); + phi->insertAfter(pgcstack); + pgcstack->replaceAllUsesWith(phi); + MDBuilder MDB(pgcstack->getContext()); + SmallVector<uint32_t, 2> Weights{9, 1}; + TerminatorInst *fastTerm; + TerminatorInst *slowTerm; + auto cmp = new ICmpInst(phi, CmpInst::ICMP_NE, pgcstack, Constant::getNullValue(pgcstack->getType())); + SplitBlockAndInsertIfThenElse(cmp, phi, &fastTerm, &slowTerm, + MDB.createBranchWeights(Weights)); + if (CFGModified) + *CFGModified = true; + // emit slow branch code + CallInst *adopt = cast<CallInst>(pgcstack->clone()); + Function *adoptFunc = M->getFunction(XSTR(jl_adopt_thread)); + if (adoptFunc == NULL) { + adoptFunc = Function::Create(pgcstack_getter->getFunctionType(), + pgcstack_getter->getLinkage(), pgcstack_getter->getAddressSpace(), + XSTR(jl_adopt_thread), M); + adoptFunc->copyAttributesFrom(pgcstack_getter); + adoptFunc->copyMetadata(pgcstack_getter, 0); + } + adopt->setCalledFunction(adoptFunc); + adopt->insertBefore(slowTerm); + phi->addIncoming(adopt, slowTerm->getParent()); + // emit fast branch code + IRBuilder<> builder(fastTerm->getParent()); + fastTerm->removeFromParent(); + MDNode *tbaa = tbaa_gcframe; + Value *prior = emit_gc_unsafe_enter(builder, get_current_ptls_from_task(builder,
get_current_task_from_pgcstack(builder, pgcstack), tbaa)); + builder.Insert(fastTerm); + phi->addIncoming(pgcstack, fastTerm->getParent()); + // emit pre-return cleanup + if (CountTrackedPointers(pgcstack->getParent()->getParent()->getReturnType()).count == 0) { + auto last_gc_state = PHINode::Create(Type::getInt8Ty(pgcstack->getContext()), 2, "", phi); + // if we called jl_adopt_thread, we must end this cfunction back in the safe-state + last_gc_state->addIncoming(ConstantInt::get(Type::getInt8Ty(M->getContext()), JL_GC_STATE_SAFE), slowTerm->getParent()); + last_gc_state->addIncoming(prior, fastTerm->getParent()); + for (auto &BB : *pgcstack->getParent()->getParent()) { + if (isa<ReturnInst>(BB.getTerminator())) { + IRBuilder<> builder(BB.getTerminator()); + emit_gc_unsafe_leave(builder, get_current_ptls_from_task(builder, get_current_task_from_pgcstack(builder, phi), tbaa), last_gc_state); + } + } + } + } if (imaging_mode) { if (jl_tls_elf_support) { // if (offset != 0) - // pgcstack = tp + offset; + // pgcstack = tp + offset; // fast // else - // pgcstack = getter(); + // pgcstack = getter(); // slow auto offset = new LoadInst(getSizeTy(pgcstack->getContext()), pgcstack_offset, "", false, pgcstack); offset->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const); offset->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(pgcstack->getContext(), None)); @@ -184,7 +242,7 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified) SplitBlockAndInsertIfThenElse(cmp, pgcstack, &fastTerm, &slowTerm, MDB.createBranchWeights(Weights)); if (CFGModified) - *CFGModified = true; + *CFGModified = true; auto fastTLS = emit_pgcstack_tp(offset, fastTerm); auto phi = PHINode::Create(T_pppjlvalue, 2, "", pgcstack); @@ -248,37 +306,44 @@ void LowerPTLS::fix_pgcstack_use(CallInst *pgcstack, bool *CFGModified) } } -bool LowerPTLS::runOnModule(Module &_M, bool *CFGModified) +bool LowerPTLS::run(bool *CFGModified) { - M = &_M; - pgcstack_getter = M->getFunction("julia.get_pgcstack"); - if (!pgcstack_getter) - return false; + bool need_init = true; + auto runOnGetter = [&](bool or_new) { + Function *pgcstack_getter = M->getFunction(or_new ?
"julia.get_pgcstack_or_new" : "julia.get_pgcstack"); + if (!pgcstack_getter) + return false; - tbaa_const = tbaa_make_child_with_context(_M.getContext(), "jtbaa_const", nullptr, true).first; + if (need_init) { + tbaa_const = tbaa_make_child_with_context(M->getContext(), "jtbaa_const", nullptr, true).first; + tbaa_gcframe = tbaa_make_child_with_context(M->getContext(), "jtbaa_gcframe").first; - FT_pgcstack_getter = pgcstack_getter->getFunctionType(); + FT_pgcstack_getter = pgcstack_getter->getFunctionType(); #if defined(_OS_DARWIN_) - assert(sizeof(jl_pgcstack_key_t) == sizeof(uintptr_t)); - FT_pgcstack_getter = FunctionType::get(FT_pgcstack_getter->getReturnType(), {getSizeTy(_M.getContext())}, false); + assert(sizeof(jl_pgcstack_key_t) == sizeof(uintptr_t)); + FT_pgcstack_getter = FunctionType::get(FT_pgcstack_getter->getReturnType(), {getSizeTy(M->getContext())}, false); #endif - T_pgcstack_getter = FT_pgcstack_getter->getPointerTo(); - T_pppjlvalue = cast<PointerType>(FT_pgcstack_getter->getReturnType()); - if (imaging_mode) { - pgcstack_func_slot = create_aliased_global(T_pgcstack_getter, "jl_pgcstack_func_slot"); - pgcstack_key_slot = create_aliased_global(getSizeTy(_M.getContext()), "jl_pgcstack_key_slot"); // >= sizeof(jl_pgcstack_key_t) - pgcstack_offset = create_aliased_global(getSizeTy(_M.getContext()), "jl_tls_offset"); - } + T_pgcstack_getter = FT_pgcstack_getter->getPointerTo(); + T_pppjlvalue = cast<PointerType>(FT_pgcstack_getter->getReturnType()); + if (imaging_mode) { + pgcstack_func_slot = create_aliased_global(T_pgcstack_getter, "jl_pgcstack_func_slot"); + pgcstack_key_slot = create_aliased_global(getSizeTy(M->getContext()), "jl_pgcstack_key_slot"); // >= sizeof(jl_pgcstack_key_t) + pgcstack_offset = create_aliased_global(getSizeTy(M->getContext()), "jl_tls_offset"); + } + need_init = false; + } - for (auto it = pgcstack_getter->user_begin(); it != pgcstack_getter->user_end();) { - auto call = cast<CallInst>(*it); - ++it; - assert(call->getCalledOperand() == pgcstack_getter); - fix_pgcstack_use(call, CFGModified); - } - assert(pgcstack_getter->use_empty()); - pgcstack_getter->eraseFromParent(); - return true; + for (auto it = pgcstack_getter->user_begin(); it != pgcstack_getter->user_end();) { + auto call = cast<CallInst>(*it); + ++it; + assert(call->getCalledOperand() == pgcstack_getter); + fix_pgcstack_use(call, pgcstack_getter, or_new, CFGModified); + } + assert(pgcstack_getter->use_empty()); + pgcstack_getter->eraseFromParent(); + return true; + }; + return runOnGetter(false) + runOnGetter(true); } struct LowerPTLSLegacy: public ModulePass { @@ -290,8 +355,8 @@ struct LowerPTLSLegacy: public ModulePass { bool imaging_mode; bool runOnModule(Module &M) override { - LowerPTLS lower(imaging_mode); - return lower.runOnModule(M, nullptr); + LowerPTLS lower(M, imaging_mode); + return lower.run(nullptr); } }; @@ -304,9 +369,9 @@ static RegisterPass<LowerPTLSLegacy> X("LowerPTLS", "LowerPTLS Pass", } // anonymous namespace PreservedAnalyses LowerPTLSPass::run(Module &M, ModuleAnalysisManager &AM) { - LowerPTLS lower(imaging_mode); + LowerPTLS lower(M, imaging_mode); bool CFGModified = false; - if (lower.runOnModule(M, &CFGModified)) { + if (lower.run(&CFGModified)) { if (CFGModified) { return PreservedAnalyses::none(); } else { diff --git a/src/partr.c b/src/partr.c index eeb0d0f456d97..ec6bbe3e5720a 100644 --- a/src/partr.c +++ b/src/partr.c @@ -26,6 +26,9 @@ static const int16_t not_sleeping = 0; // it is acceptable for the thread to be sleeping. static const int16_t sleeping = 1; +// this thread is dead.
+static const int16_t sleeping_like_the_dead JL_UNUSED = 2; + // invariant: No thread is ever asleep unless sleep_check_state is sleeping (or we have a wakeup signal pending). // invariant: Any particular thread is not asleep unless that thread's sleep_check_state is sleeping. // invariant: The transition of a thread state to sleeping must be followed by a check that there wasn't work pending for it. @@ -182,7 +185,7 @@ static int sleep_check_after_threshold(uint64_t *start_cycles) static int wake_thread(int16_t tid) { - jl_ptls_t other = jl_all_tls_states[tid]; + jl_ptls_t other = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; int8_t state = sleeping; if (jl_atomic_load_relaxed(&other->sleep_check_state) == sleeping) { @@ -229,7 +232,7 @@ JL_DLLEXPORT void jl_wakeup_thread(int16_t tid) if (wake_thread(tid)) { // check if we need to notify uv_run too jl_fence(); - jl_ptls_t other = jl_all_tls_states[tid]; + jl_ptls_t other = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; jl_task_t *tid_task = jl_atomic_load_relaxed(&other->current_task); // now that we have changed the thread to not-sleeping, ensure that // either it has not yet acquired the libuv lock, or that it will @@ -244,7 +247,8 @@ JL_DLLEXPORT void jl_wakeup_thread(int16_t tid) // in the future, we might want to instead wake some fraction of threads, // and let each of those wake additional threads if they find work int anysleep = 0; - for (tid = 0; tid < jl_n_threads; tid++) { + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + for (tid = 0; tid < nthreads; tid++) { if (tid != self) anysleep |= wake_thread(tid); } diff --git a/src/safepoint.c b/src/safepoint.c index b2feccf74e068..1ff26d616a5d8 100644 --- a/src/safepoint.c +++ b/src/safepoint.c @@ -111,10 +111,6 @@ void jl_safepoint_init(void) int jl_safepoint_start_gc(void) { - if (jl_n_threads == 1) { - jl_atomic_store_relaxed(&jl_gc_running, 1); - return 1; - } // The thread should have set this already assert(jl_atomic_load_relaxed(&jl_current_task->ptls->gc_state) == JL_GC_STATE_WAITING); uv_mutex_lock(&safepoint_lock); @@ -137,10 +133,6 @@ int jl_safepoint_start_gc(void) void jl_safepoint_end_gc(void) { assert(jl_atomic_load_relaxed(&jl_gc_running)); - if (jl_n_threads == 1) { - jl_atomic_store_relaxed(&jl_gc_running, 0); - return; - } uv_mutex_lock(&safepoint_lock); // Need to reset the page protection before resetting the flag since // the thread will trigger a segfault immediately after returning from diff --git a/src/signal-handling.c b/src/signal-handling.c index 3b1e4934e764b..5154a9f563f30 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -182,14 +182,10 @@ static int *profile_get_randperm(int size) JL_DLLEXPORT int jl_profile_is_buffer_full(void) { - // declare buffer full if there isn't enough room to take samples across all threads - #if defined(_OS_WINDOWS_) - uint64_t nthreads = 1; // windows only profiles the main thread - #else - uint64_t nthreads = jl_n_threads; - #endif - // the `+ 6` is for the two block terminators `0` plus 4 metadata entries - return bt_size_cur + (((JL_BT_MAX_ENTRY_SIZE + 1) + 6) * nthreads) > bt_size_max; + // Declare buffer full if there isn't enough room to sample even just the + // thread metadata and one max-sized frame. The `+ 6` is for the two block + // terminator `0`'s plus the 4 metadata entries. 
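// Worked example for the check below, assuming (for illustration only) that
// JL_BT_MAX_ENTRY_SIZE is 16: one sample then needs at most 16 + 1 = 17 words
// of backtrace plus the 6 metadata/terminator words, so the buffer reports
// full once fewer than 23 words remain -- independent of the thread count,
// now that all threads share one buffer.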
+ return bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > bt_size_max; } static uint64_t jl_last_sigint_trigger = 0; diff --git a/src/signals-mach.c b/src/signals-mach.c index 5a1816a80f2b2..edc2b42215f67 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -50,7 +50,7 @@ void jl_mach_gc_end(void) uintptr_t item = (uintptr_t)suspended_threads.items[i]; int16_t tid = (int16_t)item; int8_t gc_state = (int8_t)(item >> 8); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; jl_atomic_store_release(&ptls2->gc_state, gc_state); thread_resume(pthread_mach_thread_np(ptls2->system_id)); } @@ -119,7 +119,8 @@ static void allocate_mach_handler() if (_keymgr_set_lockmode_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, NM_ALLOW_RECURSION)) jl_error("_keymgr_set_lockmode_processwide_ptr failed"); - arraylist_new(&suspended_threads, jl_n_threads); + int16_t nthreads = jl_atomic_load_acquire(&jl_n_threads); + arraylist_new(&suspended_threads, nthreads); // we will resize later (inside safepoint_lock), if needed pthread_t thread; pthread_attr_t attr; kern_return_t ret; @@ -221,7 +222,7 @@ static void jl_throw_in_thread(int tid, mach_port_t thread, jl_value_t *exceptio host_thread_state_t state; kern_return_t ret = thread_get_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, &count); HANDLE_MACH_ERROR("thread_get_state", ret); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; if (!jl_get_safe_restore()) { assert(exception); ptls2->bt_size = @@ -265,8 +266,9 @@ kern_return_t catch_mach_exception_raise( #endif int16_t tid; jl_ptls_t ptls2 = NULL; - for (tid = 0; tid < jl_n_threads; tid++) { - jl_ptls_t _ptls2 = jl_all_tls_states[tid]; + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + for (tid = 0; tid < nthreads; tid++) { + jl_ptls_t _ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; if (pthread_mach_thread_np(_ptls2->system_id) == thread) { ptls2 = _ptls2; break; @@ -381,9 +383,15 @@ static void attach_exception_port(thread_port_t thread, int segv_only) HANDLE_MACH_ERROR("thread_set_exception_ports", ret); } -static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) +static int jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) { - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + if (ptls2 == NULL) // this thread is not alive + return 0; + jl_task_t *ct2 = ptls2 ? 
jl_atomic_load_relaxed(&ptls2->current_task) : NULL; + if (ct2 == NULL) // this thread is already dead + return 0; + mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); kern_return_t ret = thread_suspend(thread); @@ -395,18 +403,22 @@ static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx) // Get the state of the suspended thread ret = thread_get_state(thread, MACH_THREAD_STATE, (thread_state_t)ctx, &count); + return 1; } static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) { static host_thread_state_t state; - jl_thread_suspend_and_get_state2(tid, &state); + if (!jl_thread_suspend_and_get_state2(tid, &state)) { + *ctx = NULL; + return; + } *ctx = (unw_context_t*)&state; } static void jl_thread_resume(int tid, int sig) { - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); kern_return_t ret = thread_resume(thread); HANDLE_MACH_ERROR("thread_resume", ret); @@ -416,7 +428,7 @@ static void jl_thread_resume(int tid, int sig) // or if SIGINT happens too often. static void jl_try_deliver_sigint(void) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); kern_return_t ret = thread_suspend(thread); @@ -452,11 +464,12 @@ CFI_NORETURN static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; mach_port_t thread = pthread_mach_thread_np(ptls2->system_id); host_thread_state_t state; - jl_thread_suspend_and_get_state2(0, &state); + if (!jl_thread_suspend_and_get_state2(0, &state)) + return; unw_context_t *uc = (unw_context_t*)&state; // This aborts `sleep` and other syscalls. @@ -608,8 +621,9 @@ void *mach_profile_listener(void *arg) // (so that thread zero gets notified last) int keymgr_locked = jl_lock_profile_mach(0); - int *randperm = profile_get_randperm(jl_n_threads); - for (int idx = jl_n_threads; idx-- > 0; ) { + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + int *randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { // Stop the threads in the random or reverse round-robin order. 
int i = randperm[idx]; // if there is no space left, break early @@ -621,7 +635,8 @@ void *mach_profile_listener(void *arg) if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) _dyld_atfork_prepare(); // briefly acquire the dlsym lock host_thread_state_t state; - jl_thread_suspend_and_get_state2(i, &state); + if (!jl_thread_suspend_and_get_state2(i, &state)) + continue; unw_context_t *uc = (unw_context_t*)&state; if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) _dyld_atfork_parent(); // quickly release the dlsym lock @@ -660,12 +675,12 @@ void *mach_profile_listener(void *arg) #else bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); #endif - jl_ptls_t ptls = jl_all_tls_states[i]; + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; // store threadid but add 1 as 0 is preserved to indicate end of block bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - // store task id + // store task id (never null) bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); // store cpu cycle clock diff --git a/src/signals-unix.c b/src/signals-unix.c index 8fad82e0e40dc..5fd9b3c44587e 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -372,7 +372,14 @@ static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) clock_gettime(CLOCK_REALTIME, &ts); ts.tv_sec += 1; pthread_mutex_lock(&in_signal_lock); - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL; + if (ct2 == NULL) { + // this thread is not alive or already dead + *ctx = NULL; + pthread_mutex_unlock(&in_signal_lock); + return; + } jl_atomic_store_release(&ptls2->signal_request, 1); pthread_kill(ptls2->system_id, SIGUSR2); // wait for thread to acknowledge @@ -404,7 +411,7 @@ static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx) static void jl_thread_resume(int tid, int sig) { - jl_ptls_t ptls2 = jl_all_tls_states[tid]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; jl_atomic_store_release(&ptls2->signal_request, sig == -1 ? 3 : 1); pthread_cond_broadcast(&exit_signal_cond); pthread_cond_wait(&signal_caught_cond, &in_signal_lock); // wait for thread to acknowledge @@ -420,7 +427,7 @@ static void jl_thread_resume(int tid, int sig) // or if SIGINT happens too often. 
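/*
 * The liveness convention shared by the suspend paths above and by
 * jl_delete_thread later in this diff: a tid is usable only if its slot
 * holds a ptls whose current_task is still non-NULL, and both reads happen
 * under the same lock (in_signal_lock here) that jl_delete_thread holds
 * while clearing current_task. `thread_is_alive` is a hypothetical helper
 * distilled from that code, not part of the patch.
 */
static int thread_is_alive(int16_t tid) // caller holds in_signal_lock
{
    jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
    if (ptls2 == NULL) // slot never became a running thread
        return 0;
    // cleared by jl_delete_thread when the thread exits
    return jl_atomic_load_relaxed(&ptls2->current_task) != NULL;
}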
static void jl_try_deliver_sigint(void) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; jl_safepoint_enable_sigint(); jl_wake_libuv(); jl_atomic_store_release(&ptls2->signal_request, 2); @@ -451,7 +458,7 @@ CFI_NORETURN static void jl_exit_thread0(int state, jl_bt_element_t *bt_data, size_t bt_size) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; if (thread0_exit_count <= 1) { unw_context_t *signal_context; jl_thread_suspend_and_get_state(0, &signal_context); @@ -701,7 +708,7 @@ void trigger_profile_peek(void) if (bt_size_max == 0){ // If the buffer hasn't been initialized, initialize with default size // Keep these values synchronized with Profile.default_init() - if (jl_profile_init(10000000 * jl_n_threads, 1000000) == -1){ + if (jl_profile_init(10000000, 1000000) == -1) { jl_safe_printf("ERROR: could not initialize the profile buffer"); return; } @@ -831,6 +838,7 @@ static void *signal_listener(void *arg) } #endif + int nthreads = jl_atomic_load_acquire(&jl_n_threads); bt_size = 0; #if !defined(JL_DISABLE_LIBUNWIND) unw_context_t *signal_context; @@ -840,8 +848,8 @@ static void *signal_listener(void *arg) jl_lock_profile(); int *randperm; if (profile) - randperm = profile_get_randperm(jl_n_threads); - for (int idx = jl_n_threads; idx-- > 0; ) { + randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { // Stop the threads in the random or reverse round-robin order. int i = profile ? randperm[idx] : idx; // notify thread to stop @@ -853,7 +861,7 @@ static void *signal_listener(void *arg) // this part must be signal-handler safe if (critical) { bt_size += rec_backtrace_ctx(bt_data + bt_size, - JL_MAX_BT_SIZE / jl_n_threads - 1, + JL_MAX_BT_SIZE / nthreads - 1, signal_context, NULL); bt_data[bt_size++].uintptr = 0; } @@ -880,12 +888,12 @@ static void *signal_listener(void *arg) } jl_set_safe_restore(old_buf); - jl_ptls_t ptls2 = jl_all_tls_states[i]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; // store threadid but add 1 as 0 is preserved to indicate end of block bt_data_prof[bt_size_cur++].uintptr = ptls2->tid + 1; - // store task id + // store task id (never null) bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task); // store cpu cycle clock @@ -927,11 +935,11 @@ static void *signal_listener(void *arg) else { #ifndef SIGINFO // SIGINFO already prints this automatically int nrunning = 0; - for (int idx = jl_n_threads; idx-- > 0; ) { - jl_ptls_t ptls2 = jl_all_tls_states[idx]; + for (int idx = nthreads; idx-- > 0; ) { + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[idx]; nrunning += !jl_atomic_load_relaxed(&ptls2->sleep_check_state); } - jl_safe_printf("\ncmd: %s %d running %d of %d\n", jl_options.julia_bin ? jl_options.julia_bin : "julia", uv_os_getpid(), nrunning, jl_n_threads); + jl_safe_printf("\ncmd: %s %d running %d of %d\n", jl_options.julia_bin ? jl_options.julia_bin : "julia", uv_os_getpid(), nrunning, nthreads); #endif jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig)); diff --git a/src/signals-win.c b/src/signals-win.c index 178a7463b8d50..83e92ff400e1d 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -165,7 +165,7 @@ HANDLE hMainThread = INVALID_HANDLE_VALUE; // Try to throw the exception in the master thread. 
static void jl_try_deliver_sigint(void) { - jl_ptls_t ptls2 = jl_all_tls_states[0]; + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; jl_lock_profile(); jl_safepoint_enable_sigint(); jl_wake_libuv(); @@ -362,12 +362,12 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, &ctxThread, NULL); - jl_ptls_t ptls = jl_all_tls_states[0]; // given only profiling hMainThread + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread // store threadid but add 1 as 0 is preserved to indicate end of block bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - // store task id + // store task id (never null) bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); // store cpu cycle clock diff --git a/src/task.c b/src/task.c index a1adb704695a7..5c7c521f89b09 100644 --- a/src/task.c +++ b/src/task.c @@ -331,7 +331,8 @@ JL_DLLEXPORT void *jl_task_stack_buffer(jl_task_t *task, size_t *size, int *ptid { size_t off = 0; #ifndef _OS_WINDOWS_ - if (jl_all_tls_states[0]->root_task == task) { + jl_ptls_t ptls0 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; + if (ptls0->root_task == task) { // See jl_init_root_task(). The root task of the main thread // has its buffer enlarged by an artificial 3000000 bytes, but // that means that the start of the buffer usually points to @@ -372,7 +373,8 @@ JL_DLLEXPORT void jl_active_task_stack(jl_task_t *task, else if (task->stkbuf) { *total_start = *active_start = (char*)task->stkbuf; #ifndef _OS_WINDOWS_ - if (jl_all_tls_states[0]->root_task == task) { + jl_ptls_t ptls0 = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; + if (ptls0->root_task == task) { // See jl_init_root_task(). 
The root task of the main thread // has its buffer enlarged by an artificial 3000000 bytes, but // that means that the start of the buffer usually points to diff --git a/src/threading.c b/src/threading.c index 2cebdb22fc0aa..581032092168c 100644 --- a/src/threading.c +++ b/src/threading.c @@ -46,12 +46,16 @@ JL_DLLEXPORT void *jl_get_ptls_states(void) return jl_current_task->ptls; } +static void jl_delete_thread(void*); + #if !defined(_OS_WINDOWS_) +static pthread_key_t jl_task_exit_key; static pthread_key_t jl_safe_restore_key; __attribute__((constructor)) void _jl_init_safe_restore(void) { pthread_key_create(&jl_safe_restore_key, NULL); + pthread_key_create(&jl_task_exit_key, jl_delete_thread); } JL_DLLEXPORT jl_jmp_buf *jl_get_safe_restore(void) @@ -124,21 +128,26 @@ static DWORD jl_safe_restore_key; BOOLEAN WINAPI DllMain(IN HINSTANCE hDllHandle, IN DWORD nReason, IN LPVOID Reserved) { + jl_task_t *ct; switch (nReason) { case DLL_PROCESS_ATTACH: jl_pgcstack_key = TlsAlloc(); assert(jl_pgcstack_key != TLS_OUT_OF_INDEXES); jl_safe_restore_key = TlsAlloc(); assert(jl_safe_restore_key != TLS_OUT_OF_INDEXES); - // Fall through - case DLL_THREAD_ATTACH: - break; - case DLL_THREAD_DETACH: break; case DLL_PROCESS_DETACH: TlsFree(jl_pgcstack_key); TlsFree(jl_safe_restore_key); break; + case DLL_THREAD_ATTACH: + // will call jl_adopt_thread lazily on-demand + break; + case DLL_THREAD_DETACH: + ct = jl_get_current_task(); + if (ct != NULL) + jl_delete_thread((void*)ct->ptls); + break; } return 1; // success } @@ -291,7 +300,8 @@ void jl_pgcstack_getkey(jl_get_pgcstack_func **f, jl_pgcstack_key_t *k) #endif static uv_mutex_t tls_lock; // controls write-access to these variables: -jl_ptls_t *jl_all_tls_states JL_GLOBALLY_ROOTED; +_Atomic(jl_ptls_t*) jl_all_tls_states JL_GLOBALLY_ROOTED; +int jl_all_tls_states_size; static uv_cond_t cond; // return calling thread's ID @@ -302,7 +312,8 @@ JL_DLLEXPORT int16_t jl_threadid(void) JL_DLLEXPORT int8_t jl_threadpoolid(int16_t tid) JL_NOTSAFEPOINT { - if (tid < 0 || tid >= jl_n_threads) + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + if (tid < 0 || tid >= nthreads) jl_error("invalid tid"); int n = 0; for (int i = 0; i < jl_n_threadpools; i++) { @@ -310,14 +321,25 @@ JL_DLLEXPORT int8_t jl_threadpoolid(int16_t tid) JL_NOTSAFEPOINT if (tid < n) return (int8_t)i; } - jl_error("internal error: couldn't determine threadpool id"); + return 0; // everything else uses threadpool 0 (though does not become part of any threadpool) } jl_ptls_t jl_init_threadtls(int16_t tid) { +#ifndef _OS_WINDOWS_ + if (pthread_getspecific(jl_task_exit_key)) + abort(); +#endif + if (jl_get_pgcstack() != NULL) + abort(); jl_ptls_t ptls = (jl_ptls_t)calloc(1, sizeof(jl_tls_states_t)); +#ifndef _OS_WINDOWS_ + pthread_setspecific(jl_task_exit_key, (void*)ptls); +#endif ptls->system_id = (jl_thread_t)(uintptr_t)uv_thread_self(); ptls->rngseed = jl_rand(); + if (tid == 0) + ptls->disable_gc = 1; #ifdef _OS_WINDOWS_ if (tid == 0) { if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(), @@ -328,7 +350,6 @@ jl_ptls_t jl_init_threadtls(int16_t tid) } } #endif - ptls->tid = tid; jl_atomic_store_relaxed(&ptls->gc_state, 0); // GC unsafe // Conditionally initialize the safepoint address. 
See comment in // `safepoint.c` @@ -349,11 +370,80 @@ jl_ptls_t jl_init_threadtls(int16_t tid) uv_mutex_init(&ptls->sleep_lock); uv_cond_init(&ptls->wake_signal); - jl_all_tls_states[tid] = ptls; + uv_mutex_lock(&tls_lock); + jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); + if (tid == -1) + tid = jl_atomic_load_relaxed(&jl_n_threads); + ptls->tid = tid; + if (jl_all_tls_states_size <= tid) { + int i, newsize = jl_all_tls_states_size + tid + 2; + jl_ptls_t *newpptls = (jl_ptls_t*)calloc(newsize, sizeof(jl_ptls_t)); + for (i = 0; i < jl_all_tls_states_size; i++) { + newpptls[i] = allstates[i]; + } + jl_atomic_store_release(&jl_all_tls_states, newpptls); + jl_all_tls_states_size = newsize; + jl_gc_add_quiescent(ptls, (void**)allstates, free); + allstates = newpptls; + } + allstates[tid] = ptls; + if (jl_atomic_load_relaxed(&jl_n_threads) < tid + 1) + jl_atomic_store_release(&jl_n_threads, tid + 1); + jl_fence(); + uv_mutex_unlock(&tls_lock); return ptls; } +JL_DLLEXPORT jl_gcframe_t **jl_adopt_thread(void) +{ + // initialize this thread (assign tid, create heap, set up root task) + jl_ptls_t ptls = jl_init_threadtls(-1); + void *stack_lo, *stack_hi; + jl_init_stack_limits(0, &stack_lo, &stack_hi); + + (void)jl_gc_unsafe_enter(ptls); + // warning: this changes `jl_current_task`, so be careful not to call that from this function + jl_task_t *ct = jl_init_root_task(ptls, stack_lo, stack_hi); + JL_GC_PROMISE_ROOTED(ct); + + return &ct->gcstack; +} + +static void jl_delete_thread(void *value) +{ + jl_ptls_t ptls = (jl_ptls_t)value; + // Acquire the profile write lock, to ensure we are not racing with the `kill` + // call in the profile code which will also try to look at these variables. + // We have no control over when the user calls pthread_join, so we must do + // this here by blocking. This also synchronizes our read of `current_task` + // (which is the flag we currently use to check the liveness state of a thread). +#ifdef _OS_WINDOWS_ + jl_lock_profile_wr(); +#elif defined(JL_DISABLE_LIBUNWIND) + // nothing +#elif defined(__APPLE__) + jl_lock_profile_wr(); +#else + pthread_mutex_lock(&in_signal_lock); +#endif +#ifndef _OS_WINDOWS_ + pthread_setspecific(jl_task_exit_key, NULL); +#endif + jl_atomic_store_relaxed(&ptls->current_task, NULL); // dead + jl_atomic_store_relaxed(&ptls->sleep_check_state, 2); // dead, interpreted as sleeping and unwakeable +#ifdef _OS_WINDOWS_ + jl_unlock_profile_wr(); +#elif defined(JL_DISABLE_LIBUNWIND) + // nothing +#elif defined(__APPLE__) + jl_unlock_profile_wr(); +#else + pthread_mutex_unlock(&in_signal_lock); +#endif + (void)jl_gc_safe_enter(ptls); +} + JL_DLLEXPORT jl_mutex_t jl_codegen_lock; jl_mutex_t typecache_lock; @@ -467,7 +557,6 @@ void jl_init_threading(void) uv_mutex_init(&tls_lock); uv_cond_init(&cond); - #ifdef JL_ELF_TLS_VARIANT jl_check_tls(); #endif @@ -477,8 +566,8 @@ void jl_init_threading(void) // environment variable. Set the globals `jl_n_threadpools`, `jl_n_threads` // and `jl_n_threads_per_pool`. 
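/*
 * Sketch of how a foreign thread uses the jl_adopt_thread entry point
 * defined above. The embedding boilerplate is illustrative and assumes an
 * already-initialized runtime; `foreign_worker` is hypothetical, not part
 * of the patch. Entry points generated by @ccallable or cfunction take this
 * path automatically through the julia.get_pgcstack_or_new lowering in
 * llvm-ptls.cpp, and jl_atexit_hook uses the same call to adopt an unknown
 * exiting thread.
 */
#include <julia.h>

static void *foreign_worker(void *arg)
{
    // started by pthread_create (or Java, etc.), so not yet known to Julia
    (void)jl_adopt_thread();
    // ordinary API calls are legal from here on
    jl_eval_string("println(Threads.maxthreadid())");
    // the pthread key destructor (jl_delete_thread) unregisters the thread
    // again when it exits
    return arg;
}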
     jl_n_threadpools = 1;
-    jl_n_threads = JULIA_NUM_THREADS;
-    int16_t nthreads = jl_n_threads, nthreadsi = 0;
+    int16_t nthreads = JULIA_NUM_THREADS;
+    int16_t nthreadsi = 0;
     char *endptr, *endptri;
 
     if (jl_options.nthreads != 0) { // --threads specified
@@ -516,26 +605,26 @@
         }
     }
 
-    jl_n_threads = nthreads + nthreadsi;
-    jl_n_threads_per_pool = (int *)malloc(2 * sizeof(int));
+    jl_all_tls_states_size = nthreads + nthreadsi;
+    jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int));
     jl_n_threads_per_pool[0] = nthreads;
     jl_n_threads_per_pool[1] = nthreadsi;
-#ifndef __clang_gcanalyzer__
-    jl_all_tls_states = (jl_ptls_t*)calloc(jl_n_threads, sizeof(void*));
-#endif
+    jl_atomic_store_release(&jl_all_tls_states, (jl_ptls_t*)calloc(jl_all_tls_states_size, sizeof(jl_ptls_t)));
+    jl_atomic_store_release(&jl_n_threads, jl_all_tls_states_size);
 }
 
 static uv_barrier_t thread_init_done;
 
 void jl_start_threads(void)
 {
+    int nthreads = jl_atomic_load_relaxed(&jl_n_threads);
     int cpumasksize = uv_cpumask_size();
     char *cp;
     int i, exclusive;
     uv_thread_t uvtid;
-    if (cpumasksize < jl_n_threads) // also handles error case
-        cpumasksize = jl_n_threads;
+    if (cpumasksize < nthreads) // also handles error case
+        cpumasksize = nthreads;
     char *mask = (char*)alloca(cpumasksize);
 
     // do we have exclusive use of the machine? default is no
@@ -548,7 +637,7 @@ void jl_start_threads(void)
     // according to a 'compact' policy
     // non-exclusive: no affinity settings; let the kernel move threads about
     if (exclusive) {
-        if (jl_n_threads > jl_cpu_threads()) {
+        if (nthreads > jl_cpu_threads()) {
             jl_printf(JL_STDERR, "ERROR: Too many threads requested for %s option.\n", MACHINE_EXCLUSIVE_NAME);
             exit(1);
         }
@@ -559,9 +648,6 @@ void jl_start_threads(void)
         mask[0] = 0;
     }
 
-    // The analyzer doesn't know jl_n_threads doesn't change, help it
-    size_t nthreads = jl_n_threads;
-
     // create threads
     uv_barrier_init(&thread_init_done, nthreads);
 
diff --git a/src/threading.h b/src/threading.h
index 4c6f1e19881f5..9fd63f0fd188d 100644
--- a/src/threading.h
+++ b/src/threading.h
@@ -12,7 +12,7 @@ extern "C" {
 
 #define PROFILE_JL_THREADING 0
 
-extern jl_ptls_t *jl_all_tls_states JL_GLOBALLY_ROOTED; /* thread local storage */
+extern _Atomic(jl_ptls_t*) jl_all_tls_states JL_GLOBALLY_ROOTED; /* thread local storage */
 
 typedef struct _jl_threadarg_t {
     int16_t tid;
diff --git a/stdlib/Distributed/test/distributed_exec.jl b/stdlib/Distributed/test/distributed_exec.jl
index 9dffbe0e41994..c2c6efaa6f7e1 100644
--- a/stdlib/Distributed/test/distributed_exec.jl
+++ b/stdlib/Distributed/test/distributed_exec.jl
@@ -153,12 +153,12 @@ function _getenv_include_thread_unsafe()
 end
 const _env_include_thread_unsafe = _getenv_include_thread_unsafe()
 function include_thread_unsafe_tests()
-    if Threads.nthreads() > 1
+    if Threads.maxthreadid() > 1
         if _env_include_thread_unsafe
             return true
         end
-        msg = "Skipping a thread-unsafe test because `Threads.nthreads() > 1`"
-        @warn msg Threads.nthreads()
+        msg = "Skipping a thread-unsafe test because `Threads.maxthreadid() > 1`"
+        @warn msg Threads.maxthreadid()
         Test.@test_broken false
         return false
     end
diff --git a/stdlib/InteractiveUtils/src/InteractiveUtils.jl b/stdlib/InteractiveUtils/src/InteractiveUtils.jl
index 4621ed07ed124..8d0f23f5756ce 100644
--- a/stdlib/InteractiveUtils/src/InteractiveUtils.jl
+++ b/stdlib/InteractiveUtils/src/InteractiveUtils.jl
@@ -141,7 +141,7 @@ function versioninfo(io::IO=stdout; verbose::Bool=false)
     println(io, "  WORD_SIZE: ", Sys.WORD_SIZE)
     println(io, "  LIBM: ",Base.libm_name)
     println(io, "  LLVM: libLLVM-",Base.libllvm_version," (", Sys.JIT, ", ", Sys.CPU_NAME, ")")
-    println(io, "  Threads: ", Threads.nthreads(), " on ", Sys.CPU_THREADS, " virtual cores")
+    println(io, "  Threads: ", Threads.maxthreadid(), " on ", Sys.CPU_THREADS, " virtual cores")
 
     function is_nonverbose_env(k::String)
         return occursin(r"^JULIA_|^DYLD_|^LD_", k)
diff --git a/stdlib/LinearAlgebra/src/LinearAlgebra.jl b/stdlib/LinearAlgebra/src/LinearAlgebra.jl
index d9376cbda80cc..0a0162da0b1b8 100644
--- a/stdlib/LinearAlgebra/src/LinearAlgebra.jl
+++ b/stdlib/LinearAlgebra/src/LinearAlgebra.jl
@@ -570,7 +570,8 @@ function versioninfo(io::IO=stdout)
         println(io, indent, "--> ", lib.libname, " (", interface, ")")
     end
     println(io, "Threading:")
-    println(io, indent, "Threads.nthreads() = ", Base.Threads.nthreads())
+    println(io, indent, "Threads.threadpoolsize() = ", Threads.threadpoolsize())
+    println(io, indent, "Threads.maxthreadid() = ", Base.Threads.maxthreadid())
     println(io, indent, "LinearAlgebra.BLAS.get_num_threads() = ", BLAS.get_num_threads())
     println(io, "Relevant environment variables:")
     env_var_names = [
diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index 0f94d54a9c1aa..2083d2c0dc39e 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -88,10 +88,6 @@ stored per thread. Each instruction pointer corresponds to a single line of code
 list of instruction pointers. Note that 6 spaces for instruction pointers per backtrace are used to store
 metadata and two NULL end markers. Current settings can be obtained by calling this function with no
 arguments, and each can be set independently using keywords or in the order `(n, delay)`.
-
-!!! compat "Julia 1.8"
-    As of Julia 1.8, this function allocates space for `n` instruction pointers per thread being profiled.
-    Previously this was `n` total.
 """
 function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} = nothing, limitwarn::Bool = true)
     n_cur = ccall(:jl_profile_maxlen_data, Csize_t, ())
@@ -102,8 +98,7 @@
 end
     delay_cur = ccall(:jl_profile_delay_nsec, UInt64, ())/10^9
     if n === nothing && delay === nothing
-        nthreads = Sys.iswindows() ? 1 : Threads.nthreads() # windows only profiles the main thread
-        return round(Int, n_cur / nthreads), delay_cur
+        return n_cur, delay_cur
     end
     nnew = (n === nothing) ? n_cur : n
     delaynew = (delay === nothing) ? delay_cur : delay
@@ -111,20 +106,17 @@ end
 
 function init(n::Integer, delay::Real; limitwarn::Bool = true)
-    nthreads = Sys.iswindows() ? 1 : Threads.nthreads() # windows only profiles the main thread
     sample_size_bytes = sizeof(Ptr) # == Sys.WORD_SIZE / 8
-    buffer_samples = n * nthreads
+    buffer_samples = n
     buffer_size_bytes = buffer_samples * sample_size_bytes
     if buffer_size_bytes > 2^29 && Sys.WORD_SIZE == 32
-        buffer_size_bytes_per_thread = floor(Int, 2^29 / nthreads)
-        buffer_samples_per_thread = floor(Int, buffer_size_bytes_per_thread / sample_size_bytes)
-        buffer_samples = buffer_samples_per_thread * nthreads
+        buffer_samples = floor(Int, 2^29 / sample_size_bytes)
         buffer_size_bytes = buffer_samples * sample_size_bytes
-        limitwarn && @warn "Requested profile buffer limited to 512MB (n = $buffer_samples_per_thread per thread) given that this system is 32-bit"
+        limitwarn && @warn "Requested profile buffer limited to 512MB (n = $buffer_samples) given that this system is 32-bit"
     end
-    status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), buffer_samples, round(UInt64,10^9*delay))
+    status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), buffer_samples, round(UInt64, 10^9*delay))
     if status == -1
-        error("could not allocate space for ", n, " instruction pointers per thread being profiled ($nthreads threads, $(Base.format_bytes(buffer_size_bytes)) total)")
+        error("could not allocate space for ", n, " instruction pointers ($(Base.format_bytes(buffer_size_bytes)))")
     end
 end
 
@@ -427,7 +419,7 @@ function getdict!(dict::LineInfoDict, data::Vector{UInt})
     n_unique_ips = length(unique_ips)
     n_unique_ips == 0 && return dict
     iplookups = similar(unique_ips, Vector{StackFrame})
-    @sync for indexes_part in Iterators.partition(eachindex(unique_ips), div(n_unique_ips, Threads.nthreads(), RoundUp))
+    @sync for indexes_part in Iterators.partition(eachindex(unique_ips), div(n_unique_ips, Threads.threadpoolsize(), RoundUp))
         Threads.@spawn begin
             for i in indexes_part
                 iplookups[i] = _lookup_corrected(unique_ips[i])
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index c543298ef9e4c..882e3c88265f8 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -120,11 +120,10 @@ end
 @testset "setting sample count and delay in init" begin
     n_, delay_ = Profile.init()
     n_original = n_
-    nthreads = Sys.iswindows() ? 1 : Threads.nthreads()
     sample_size_bytes = sizeof(Ptr)
     def_n = Sys.iswindows() && Sys.WORD_SIZE == 32 ? 1_000_000 : 10_000_000
-    if Sys.WORD_SIZE == 32 && (def_n * nthreads * sample_size_bytes) > 2^29
-        @test n_ * nthreads * sample_size_bytes <= 2^29
+    if Sys.WORD_SIZE == 32 && (def_n * sample_size_bytes) > 2^29
+        @test n_ * sample_size_bytes <= 2^29
     else
         @test n_ == def_n
     end
@@ -133,8 +132,8 @@ end
     @test delay_ == def_delay
     Profile.init(n=1_000_001, delay=0.0005)
     n_, delay_ = Profile.init()
-    if Sys.WORD_SIZE == 32 && (1_000_001 * nthreads * sample_size_bytes) > 2^29
-        @test n_ * nthreads * sample_size_bytes <= 2^29
+    if Sys.WORD_SIZE == 32 && (1_000_001 * sample_size_bytes) > 2^29
+        @test n_ * sample_size_bytes <= 2^29
     else
         @test n_ == 1_000_001
     end
diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl
index cc76c6fcfb0c8..7063f1f87bf68 100644
--- a/test/cmdlineargs.jl
+++ b/test/cmdlineargs.jl
@@ -226,7 +226,7 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no`
     @test errors_not_signals(`$exename --cpu-target=invalidtarget`)
 
     # -t, --threads
-    code = "print(Threads.nthreads())"
+    code = "print(Threads.threadpoolsize())"
     cpu_threads = ccall(:jl_effective_threads, Int32, ())
     @test string(cpu_threads) ==
           read(`$exename --threads auto -e $code`, String) ==
@@ -254,7 +254,7 @@ let exename = `$(Base.julia_cmd()) --startup-file=no --color=no`
 
     # Combining --threads and --procs: --threads does propagate
     withenv("JULIA_NUM_THREADS" => nothing) do
-        code = "print(sum(remotecall_fetch(Threads.nthreads, x) for x in procs()))"
+        code = "print(sum(remotecall_fetch(Threads.threadpoolsize, x) for x in procs()))"
         @test read(`$exename -p2 -t2 -e $code`, String) == "6"
     end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 4c9ac1cfd869c..3227804cf7b47 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -127,7 +127,7 @@ cd(@__DIR__) do
         println("""
             Running parallel tests with:
               nworkers() = $(nworkers())
-              nthreads() = $(Threads.nthreads())
+              nthreads() = $(Threads.threadpoolsize())
               Sys.CPU_THREADS = $(Sys.CPU_THREADS)
               Sys.total_memory() = $(Base.format_bytes(Sys.total_memory()))
               Sys.free_memory() = $(Base.format_bytes(Sys.free_memory()))
diff --git a/test/threads.jl b/test/threads.jl
index 09e802757062b..fb684b275e864 100644
--- a/test/threads.jl
+++ b/test/threads.jl
@@ -124,7 +124,7 @@ end
 
 function get_nthreads(options = ``; cpus = nothing)
     cmd = `$(Base.julia_cmd()) --startup-file=no $(options)`
-    cmd = `$cmd -e "print(Threads.nthreads())"`
+    cmd = `$cmd -e "print(Threads.threadpoolsize())"`
     cmd = addenv(cmd, "JULIA_EXCLUSIVE" => "0", "JULIA_NUM_THREADS" => "auto")
     if cpus !== nothing
         cmd = setcpuaffinity(cmd, cpus)
diff --git a/test/threads_exec.jl b/test/threads_exec.jl
index 4bce3ebd71b41..68ba9377cf955 100644
--- a/test/threads_exec.jl
+++ b/test/threads_exec.jl
@@ -2,7 +2,7 @@
 
 using Test
 using Base.Threads
-using Base.Threads: SpinLock
+using Base.Threads: SpinLock, threadpoolsize
 
 # for cfunction_closure
 include("testenv.jl")
@@ -27,9 +27,12 @@ end
 # (expected test duration is about 18-180 seconds)
 Timer(t -> killjob("KILLING BY THREAD TEST WATCHDOG\n"), 1200)
 
+@test Threads.threadid() == 1
+@test 1 <= threadpoolsize() <= Threads.maxthreadid()
+
 # basic lock check
-if nthreads() > 1
-    let lk = Base.Threads.SpinLock()
+if threadpoolsize() > 1
+    let lk = SpinLock()
         c1 = Base.Event()
         c2 = Base.Event()
         @test trylock(lk)
@@ -50,7 +53,7 @@ end
 
 # threading constructs
 
-let a = zeros(Int, 2 * nthreads())
+let a = zeros(Int, 2 * threadpoolsize())
     @threads for i = 1:length(a)
         @sync begin
             @async begin
@@ -70,7 +73,7 @@ end
 
 # parallel loop with parallel atomic addition
 function threaded_loop(a, r, x)
-    counter = Threads.Atomic{Int}(min(Threads.nthreads(), length(r)))
+    counter = Threads.Atomic{Int}(min(threadpoolsize(), length(r)))
     @threads for i in r
         # synchronize the start given that each partition is started sequentially,
         # meaning that without the wait, if the loop is too fast the iteration can happen in order
@@ -208,7 +211,7 @@ function threaded_gc_locked(::Type{LockT}) where LockT
 end
 
 threaded_gc_locked(SpinLock)
-threaded_gc_locked(Threads.ReentrantLock)
+threaded_gc_locked(ReentrantLock)
 
 # Issue 33159
 # Make sure that a Threads.Condition can't be used without being locked, on any thread.
@@ -423,7 +426,7 @@ end
 for T in intersect((Int32, Int64, Float32, Float64), Base.Threads.atomictypes)
     var = Atomic{T}()
     nloops = 1000
-    di = nthreads()
+    di = threadpoolsize()
     @threads for i in 1:di
         test_atomic_cas!(var, i:di:nloops)
     end
@@ -513,7 +516,7 @@ function test_thread_cfunction()
     @test cfs[1] == cf1
     @test cfs[2] == cf(fs[2])
     @test length(unique(cfs)) == 1000
-    ok = zeros(Int, nthreads())
+    ok = zeros(Int, threadpoolsize())
     @threads :static for i in 1:10000
         i = mod1(i, 1000)
         fi = fs[i]
@@ -529,14 +532,14 @@ if cfunction_closure
 end
 
 function test_thread_range()
-    a = zeros(Int, nthreads())
+    a = zeros(Int, threadpoolsize())
     @threads for i in 1:threadid()
         a[i] = 1
     end
     for i in 1:threadid()
         @test a[i] == 1
     end
-    for i in (threadid() + 1):nthreads()
+    for i in (threadid() + 1):threadpoolsize()
         @test a[i] == 0
     end
 end
@@ -576,17 +579,17 @@ test_nested_loops()
 
 function test_thread_too_few_iters()
     x = Atomic()
-    a = zeros(Int, nthreads()+2)
-    threaded_loop(a, 1:nthreads()-1, x)
-    found = zeros(Bool, nthreads()+2)
-    for i=1:nthreads()-1
+    a = zeros(Int, threadpoolsize()+2)
+    threaded_loop(a, 1:threadpoolsize()-1, x)
+    found = zeros(Bool, threadpoolsize()+2)
+    for i=1:threadpoolsize()-1
         found[a[i]] = true
     end
-    @test x[] == nthreads()-1
+    @test x[] == threadpoolsize()-1
     # Next test checks that all loop iterations ran,
    # and were unique (via pigeon-hole principle).
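> **Reviewer note (not part of the patch; the final assertions of `test_thread_too_few_iters` follow below).** These test hunks systematically replace `nthreads()` with `threadpoolsize()`: once foreign threads can be adopted, the total thread count is no longer a stable bound to size arrays with ahead of time, whereas the default pool keeps ids `1:threadpoolsize()` under `:static` scheduling, as `test_thread_cfunction` above relies on. A self-contained sketch of that bucket-per-pool-thread pattern (illustrative names, assuming no interactive pool reordering, which holds for this release):

```julia
using Base.Threads

# One bucket per default-pool worker; :static pins each contiguous chunk of
# the iteration space to one worker, so each bucket is touched by exactly
# one thread and no locking is needed.
buckets = zeros(Int, threadpoolsize())
@threads :static for i in 1:10_000
    buckets[threadid()] += 1
end
@assert sum(buckets) == 10_000  # every iteration ran exactly once
```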
-    @test !(false in found[1:nthreads()-1])
-    @test !(true in found[nthreads():end])
+    @test !(false in found[1:threadpoolsize()-1])
+    @test !(true in found[threadpoolsize():end])
 end
 test_thread_too_few_iters()
 
@@ -728,10 +731,10 @@ function _atthreads_with_error(a, err)
     end
     a
 end
-@test_throws CompositeException _atthreads_with_error(zeros(nthreads()), true)
-let a = zeros(nthreads())
+@test_throws CompositeException _atthreads_with_error(zeros(threadpoolsize()), true)
+let a = zeros(threadpoolsize())
     _atthreads_with_error(a, false)
-    @test a == [1:nthreads();]
+    @test a == [1:threadpoolsize();]
 end
 
 # static schedule
@@ -742,11 +745,11 @@ function _atthreads_static_schedule(n)
     end
     return ids
 end
-@test _atthreads_static_schedule(nthreads()) == 1:nthreads()
+@test _atthreads_static_schedule(threadpoolsize()) == 1:threadpoolsize()
 @test _atthreads_static_schedule(1) == [1;]
 @test_throws(
     "`@threads :static` cannot be used concurrently or nested",
-    @threads(for i = 1:1; _atthreads_static_schedule(nthreads()); end),
+    @threads(for i = 1:1; _atthreads_static_schedule(threadpoolsize()); end),
 )
 
 # dynamic schedule
@@ -759,35 +762,35 @@ function _atthreads_dynamic_schedule(n)
     end
     return inc[], flags
 end
-@test _atthreads_dynamic_schedule(nthreads()) == (nthreads(), ones(nthreads()))
+@test _atthreads_dynamic_schedule(threadpoolsize()) == (threadpoolsize(), ones(threadpoolsize()))
 @test _atthreads_dynamic_schedule(1) == (1, ones(1))
 @test _atthreads_dynamic_schedule(10) == (10, ones(10))
-@test _atthreads_dynamic_schedule(nthreads() * 2) == (nthreads() * 2, ones(nthreads() * 2))
+@test _atthreads_dynamic_schedule(threadpoolsize() * 2) == (threadpoolsize() * 2, ones(threadpoolsize() * 2))
 
 # nested dynamic schedule
 function _atthreads_dynamic_dynamic_schedule()
     inc = Threads.Atomic{Int}(0)
-    Threads.@threads :dynamic for _ = 1:nthreads()
-        Threads.@threads :dynamic for _ = 1:nthreads()
+    Threads.@threads :dynamic for _ = 1:threadpoolsize()
+        Threads.@threads :dynamic for _ = 1:threadpoolsize()
             Threads.atomic_add!(inc, 1)
         end
     end
     return inc[]
 end
-@test _atthreads_dynamic_dynamic_schedule() == nthreads() * nthreads()
+@test _atthreads_dynamic_dynamic_schedule() == threadpoolsize() * threadpoolsize()
 
 function _atthreads_static_dynamic_schedule()
-    ids = zeros(Int, nthreads())
+    ids = zeros(Int, threadpoolsize())
     inc = Threads.Atomic{Int}(0)
-    Threads.@threads :static for i = 1:nthreads()
+    Threads.@threads :static for i = 1:threadpoolsize()
         ids[i] = Threads.threadid()
-        Threads.@threads :dynamic for _ = 1:nthreads()
+        Threads.@threads :dynamic for _ = 1:threadpoolsize()
             Threads.atomic_add!(inc, 1)
         end
     end
     return ids, inc[]
 end
-@test _atthreads_static_dynamic_schedule() == (1:nthreads(), nthreads() * nthreads())
+@test _atthreads_static_dynamic_schedule() == (1:threadpoolsize(), threadpoolsize() * threadpoolsize())
 
 # errors inside @threads :dynamic
 function _atthreads_dynamic_with_error(a)
@@ -796,7 +799,7 @@ function _atthreads_dynamic_with_error(a)
     end
     a
 end
-@test_throws "user error in the loop body" _atthreads_dynamic_with_error(zeros(nthreads()))
+@test_throws "user error in the loop body" _atthreads_dynamic_with_error(zeros(threadpoolsize()))
 
 try
     @macroexpand @threads(for i = 1:10, j = 1:10; end)
@@ -1025,7 +1028,7 @@ function check_sync_end_race()
             nnotscheduled += y === :notscheduled
         end
         # Useful for tuning the test:
-        @debug "`check_sync_end_race` done" nthreads() ncompleted nnotscheduled nerror
+        @debug "`check_sync_end_race` done" threadpoolsize() ncompleted nnotscheduled nerror
     finally
         done[] = true
     end
@@ -1039,21 +1042,21 @@ end
 
 # issue #41546, thread-safe package loading
 @testset "package loading" begin
-    ch = Channel{Bool}(nthreads())
+    ch = Channel{Bool}(threadpoolsize())
     barrier = Base.Event()
     old_act_proj = Base.ACTIVE_PROJECT[]
     try
         pushfirst!(LOAD_PATH, "@")
         Base.ACTIVE_PROJECT[] = joinpath(@__DIR__, "TestPkg")
         @sync begin
-            for _ in 1:nthreads()
+            for _ in 1:threadpoolsize()
                 Threads.@spawn begin
                     put!(ch, true)
                     wait(barrier)
                     @eval using TestPkg
                 end
             end
-            for _ in 1:nthreads()
+            for _ in 1:threadpoolsize()
                 take!(ch)
            end
            notify(barrier)
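> **Reviewer note (not part of the patch).** The package-loading test above combines a check-in `Channel` with a `Base.Event` so that every spawned task reaches `using TestPkg` at the same moment, making the concurrent-load race as likely as possible. The same idiom, extracted as a stand-alone sketch with a hypothetical `run_together` helper:

```julia
using Base.Threads

# Sketch of the barrier idiom: start n tasks, wait until all have checked
# in, then release them simultaneously to maximize overlap of `f`.
function run_together(f, n = threadpoolsize())
    ready = Channel{Bool}(n)
    barrier = Base.Event()
    @sync begin
        for _ in 1:n
            @spawn begin
                put!(ready, true)   # check in
                wait(barrier)       # park until everyone has checked in
                f()                 # the racy operation under test
            end
        end
        for _ in 1:n
            take!(ready)            # wait for all n check-ins
        end
        notify(barrier)             # release all tasks at once
    end
end

run_together(() -> nothing)  # hypothetical usage; f would be the racy op
```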