Merge 61d8d44 into 86ee57c
vtjnash authored Apr 25, 2020
2 parents 86ee57c + 61d8d44 commit 8ce701c
Showing 24 changed files with 526 additions and 311 deletions.
19 changes: 15 additions & 4 deletions doc/src/devdocs/locks.md
@@ -33,9 +33,6 @@ The following are definitely leaf locks (level 1), and must not try to acquire a
The following is a leaf lock (level 2), and only acquires level 1 locks (safepoint) internally:

> * typecache
The following is a level 2 lock:

> * Module->lock
The following is a level 3 lock, which can only acquire level 1 or level 2 locks internally:
@@ -48,9 +45,10 @@ The following is a level 4 lock, which can only recurse to acquire level 1, 2, o
No Julia code may be called while holding a lock above this point.

The following is a level 6 lock, which can only recurse to acquire locks at lower levels:
The following are level 6 locks, which can only recurse to acquire locks at lower levels:

> * codegen
> * jl_modules_mutex
The following is an almost root lock (level end-1), meaning only the root lock may be held when
trying to acquire it:
@@ -94,6 +92,19 @@ The following locks are broken:
>
> fix: create it
* Module->lock

> This is vulnerable to deadlocks, since there is no guarantee it is acquired in a consistent order.
> Some operations (such as `import_module`) are missing a lock.
>
> fix: replace with `jl_modules_mutex`?
* loading.jl: `require` and `register_root_module`

> This file potentially has numerous problems.
>
> fix: needs locks
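
A rough illustration of the kind of fix suggested above, as a minimal sketch (the lock, table, and function names are hypothetical, not taken from `loading.jl`):

```julia
# Hypothetical sketch: serialize module registration behind a single lock so
# that concurrent `require` calls cannot race on the shared table.
const _require_lock = ReentrantLock()
const _loaded_modules = Dict{Symbol,Module}()

function register_root_module_locked(m::Module)
    lock(_require_lock) do
        _loaded_modules[nameof(m)] = m  # mutate the shared table only while holding the lock
    end
    return m
end
```
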
## Shared Global Data Structures

These data structures each need locks due to being shared mutable global state. It is the inverse
49 changes: 49 additions & 0 deletions doc/src/manual/parallel-computing.md
@@ -283,6 +283,55 @@ julia> Threads.threadid()
three processes have 2 threads enabled. For more fine-grained control over worker
threads use [`addprocs`](@ref) and pass `-t`/`--threads` as `exeflags`.
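
A hedged sketch of that invocation (the worker count, thread count, and resulting worker ids shown here are illustrative):

```julia-repl
julia> using Distributed

julia> addprocs(3; exeflags=`--threads=2`)  # 3 workers, each started with 2 threads
3-element Array{Int64,1}:
 2
 3
 4
```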

## Data-race freedom

You are entirely responsible for ensuring that your program is data-race free,
and nothing promised here can be assumed if you do not observe that
requirement. The results may be highly unintuitive.

The best way to ensure this is to acquire a lock around any access to data that
can be observed from multiple threads. For example, in most cases you should
use the following code pattern:

```julia-repl
julia> lock(a) do
           use(a)
       end

julia> begin
           lock(a)
           try
               use(a)
           finally
               unlock(a)
           end
       end
```
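
For concreteness, here is a minimal self-contained sketch of this pattern; the lock and data names are illustrative rather than taken from the manual:

```julia
# Minimal sketch: a ReentrantLock guarding a shared Dict across threads.
const dict_lock = ReentrantLock()
const shared = Dict{Int,Int}()

Threads.@threads for i in 1:1000
    lock(dict_lock) do
        shared[i] = i^2  # every access to `shared` happens while holding the lock
    end
end
```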

Additionally, Julia will automatically ensure memory safety in the presence of
some types of data races.

When publishing a boxed value to a global variable, that value can be read on
another thread without additional synchronization. However, do not assume that
observing that one value has been set transitively implies anything about other
values, even other globals!

```julia
Thread 1:
global b = false
global a = rand()
global b = true
Thread 2:
while !b; end
bad(a) # it is NOT safe to access `a` here!
Thread 3:
while !@isdefined(a); end
use(a) # it IS safe to access `a` here
```


## The `@threads` Macro

Let's work a simple example using our native threads. Let us create an array of zeros:
15 changes: 9 additions & 6 deletions src/array.c
@@ -542,10 +542,9 @@ JL_DLLEXPORT jl_value_t *jl_ptrarrayref(jl_array_t *a JL_PROPAGATES_ROOT, size_t
{
assert(i < jl_array_len(a));
assert(a->flags.ptrarray);
jl_value_t *elt = ((jl_value_t**)a->data)[i];
if (elt == NULL) {
jl_value_t *elt = jl_atomic_load_relaxed(((jl_value_t**)a->data) + i);
if (elt == NULL)
jl_throw(jl_undefref_exception);
}
return elt;
}

@@ -569,7 +568,7 @@ JL_DLLEXPORT jl_value_t *jl_arrayref(jl_array_t *a, size_t i)
JL_DLLEXPORT int jl_array_isassigned(jl_array_t *a, size_t i)
{
if (a->flags.ptrarray) {
return ((jl_value_t**)jl_array_data(a))[i] != NULL;
return jl_atomic_load_relaxed(((jl_value_t**)jl_array_data(a)) + i) != NULL;
}
else if (a->flags.hasptr) {
jl_datatype_t *eltype = (jl_datatype_t*)jl_tparam0(jl_typeof(a));
@@ -600,12 +599,14 @@ JL_DLLEXPORT void jl_arrayset(jl_array_t *a JL_ROOTING_ARGUMENT, jl_value_t *rhs
if (jl_is_datatype_singleton((jl_datatype_t*)jl_typeof(rhs)))
return;
}
if (a->flags.hasptr)
jl_fence_release();
jl_assign_bits(&((char*)a->data)[i * a->elsize], rhs);
if (a->flags.hasptr)
jl_gc_multi_wb(jl_array_owner(a), rhs);
}
else {
((jl_value_t**)a->data)[i] = rhs;
jl_atomic_store_release(((jl_value_t**)a->data) + i, rhs);
jl_gc_wb(jl_array_owner(a), rhs);
}
}
@@ -615,7 +616,7 @@ JL_DLLEXPORT void jl_arrayunset(jl_array_t *a, size_t i)
if (i >= jl_array_len(a))
jl_bounds_error_int((jl_value_t*)a, i + 1);
if (a->flags.ptrarray)
((jl_value_t**)a->data)[i] = NULL;
jl_atomic_store_release(((jl_value_t**)a->data) + i, NULL);
else if (a->flags.hasptr) {
size_t elsize = a->elsize;
jl_assume(elsize >= sizeof(void*) && elsize % sizeof(void*) == 0);
@@ -1200,6 +1201,7 @@ JL_DLLEXPORT void jl_array_ptr_copy(jl_array_t *dest, void **dest_p,
{
assert(dest->flags.ptrarray && src->flags.ptrarray);
jl_value_t *owner = jl_array_owner(dest);
jl_fence_release(); // ensure contents of src are visible on other processors
// Destination is old and doesn't refer to any young object
if (__unlikely(jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED)) {
jl_value_t *src_owner = jl_array_owner(src);
@@ -1219,6 +1221,7 @@
}
}
memmove(dest_p, src_p, n * sizeof(void*));
jl_fence_release(); // to finish up, ensure contents of dest are now visible on other processors too
}

JL_DLLEXPORT void jl_array_ptr_1d_push(jl_array_t *a, jl_value_t *item)
53 changes: 50 additions & 3 deletions src/atomics.h
@@ -44,6 +44,8 @@
* specified.
*/
#if defined(__GNUC__)
# define jl_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST)
# define jl_fence_release() __atomic_thread_fence(__ATOMIC_RELEASE)
# define jl_signal_fence() __atomic_signal_fence(__ATOMIC_SEQ_CST)
# define jl_atomic_fetch_add_relaxed(obj, arg) \
__atomic_fetch_add(obj, arg, __ATOMIC_RELAXED)
@@ -68,13 +70,13 @@
__sync_bool_compare_and_swap(obj, expected, desired)
# define jl_atomic_exchange(obj, desired) \
__atomic_exchange_n(obj, desired, __ATOMIC_SEQ_CST)
# define jl_atomic_exchange_generic(obj, desired, orig)\
__atomic_exchange(obj, desired, orig, __ATOMIC_SEQ_CST)
# define jl_atomic_exchange_relaxed(obj, desired) \
__atomic_exchange_n(obj, desired, __ATOMIC_RELAXED)
// TODO: Maybe add jl_atomic_compare_exchange_weak for spin lock
# define jl_atomic_store(obj, val) \
__atomic_store_n(obj, val, __ATOMIC_SEQ_CST)
# define jl_atomic_store_relaxed(obj, val) \
__atomic_store_n(obj, val, __ATOMIC_RELAXED)
# if defined(__clang__) || defined(__ICC) || defined(__INTEL_COMPILER) || \
!(defined(_CPU_X86_) || defined(_CPU_X86_64_))
// ICC and Clang don't have this bug...
@@ -96,6 +98,9 @@
# define jl_atomic_load_relaxed(obj) \
__atomic_load_n(obj, __ATOMIC_RELAXED)
#elif defined(_COMPILER_MICROSOFT_)
// TODO: these only define compiler barriers, and aren't correct outside of x86
# define jl_fence() _ReadWriteBarrier()
# define jl_fence_release() _WriteBarrier()
# define jl_signal_fence() _ReadWriteBarrier()

// add
@@ -123,7 +128,6 @@ jl_atomic_fetch_add(T *obj, T2 arg)
{
return (T)_InterlockedExchangeAdd64((volatile __int64*)obj, (__int64)arg);
}
// TODO: jl_atomic_exchange_generic
#define jl_atomic_fetch_add_relaxed(obj, arg) jl_atomic_fetch_add(obj, arg)

// and
@@ -267,6 +271,12 @@ static inline void jl_atomic_store_release(volatile T *obj, T2 val)
jl_signal_fence();
*obj = (T)val;
}
template<typename T, typename T2>
static inline void jl_atomic_store_relaxed(volatile T *obj, T2 val)
{
jl_signal_fence();
*obj = (T)val;
}
// atomic loads
template<typename T>
static inline T jl_atomic_load(volatile T *obj)
@@ -286,4 +296,41 @@ static inline T jl_atomic_load_acquire(volatile T *obj)
# error "No atomic operations supported."
#endif

#ifdef __clang_analyzer__
// for the purposes of the analyzer, we can turn these into non-atomic expressions with similar properties

#undef jl_atomic_exchange
#undef jl_atomic_exchange_relaxed
#define jl_atomic_exchange(obj, desired) \
(__extension__({ \
__typeof__((obj)) p = (obj); \
__typeof__(*p) temp = *p; \
*p = desired; \
temp; \
}))
#define jl_atomic_exchange_relaxed jl_atomic_exchange

#undef jl_atomic_compare_exchange
#define jl_atomic_compare_exchange(obj, expected, desired) ((expected), jl_atomic_exchange((obj), (desired)))

#undef jl_atomic_bool_compare_exchange
#define jl_atomic_bool_compare_exchange(obj, expected, desired) ((expected) == jl_atomic_exchange((obj), (desired)))

#undef jl_atomic_store
#undef jl_atomic_store_release
#undef jl_atomic_store_relaxed
#define jl_atomic_store(obj, val) (*(obj) = (val))
#define jl_atomic_store_release(obj, val) (*(obj) = (val))
#define jl_atomic_store_relaxed(obj, val) (*(obj) = (val))

#undef jl_atomic_load
#undef jl_atomic_load_acquire
#undef jl_atomic_load_relaxed
#define jl_atomic_load(obj) (*(obj))
#define jl_atomic_load_acquire(obj) (*(obj))
#define jl_atomic_load_relaxed(obj) (*(obj))

#endif


#endif // JL_ATOMICS_H
11 changes: 8 additions & 3 deletions src/builtins.c
@@ -479,6 +479,7 @@ void STATIC_INLINE _grow_to(jl_value_t **root, jl_value_t ***oldargs, jl_svec_t
if (extra)
// grow by an extra 50% if newalloc is still only a guess
newalloc += oldalloc / 2 + 16;
JL_GC_PROMISE_ROOTED(*oldargs);
jl_svec_t *newheap = _copy_to(newalloc, *oldargs, oldalloc);
*root = (jl_value_t*)newheap;
*arg_heap = newheap;
@@ -556,9 +557,13 @@ static jl_value_t *do_apply(jl_value_t *F, jl_value_t **args, uint32_t nargs, jl
else {
// put arguments on the heap if there are too many
newargs = NULL;
n_alloc = 0;
assert(precount > 0); // let optimizer know this won't overflow
_grow_to(&roots[0], &newargs, &arg_heap, &n_alloc, precount, extra);
n_alloc = precount;
if (extra)
// grow by an extra 50% if newalloc is still only a guess
n_alloc += n_alloc / 2 + 16;
arg_heap = jl_alloc_svec(n_alloc);
roots[0] = (jl_value_t*)arg_heap;
newargs = jl_svec_data(arg_heap);
}
newargs[0] = f;
precount -= 1;
37 changes: 20 additions & 17 deletions src/ccall.cpp
@@ -135,15 +135,15 @@ static Value *runtime_sym_lookup(
BasicBlock *dlsym_lookup = BasicBlock::Create(jl_LLVMContext, "dlsym");
BasicBlock *ccall_bb = BasicBlock::Create(jl_LLVMContext, "ccall");
Constant *initnul = ConstantPointerNull::get((PointerType*)T_pvoidfunc);
LoadInst *llvmf_orig = irbuilder.CreateAlignedLoad(llvmgv, sizeof(void*));
LoadInst *llvmf_orig = irbuilder.CreateAlignedLoad(T_pvoidfunc, llvmgv, sizeof(void*));
// This in principle needs a consume ordering so that load from
// this pointer sees a valid value. However, this is not supported by
// LLVM (or agreed on in the C/C++ standard FWIW) and should be
// almost impossible to happen on every platform we support since this
// ordering is enforced by the hardware and LLVM has to speculate an
// invalid load from the `cglobal` but doesn't depend on the `cglobal`
// value for this to happen.
// llvmf_orig->setAtomic(AtomicOrdering::Consume);
llvmf_orig->setAtomic(AtomicOrdering::Unordered);
irbuilder.CreateCondBr(
irbuilder.CreateICmpNE(llvmf_orig, initnul),
ccall_bb,
@@ -162,7 +162,7 @@ }
}
Value *llvmf = irbuilder.CreateCall(prepare_call_in(jl_builderModule(irbuilder), jldlsym_func),
{ libname, stringConstPtr(irbuilder, f_name), libptrgv });
auto store = irbuilder.CreateAlignedStore(llvmf, llvmgv, sizeof(void*));
StoreInst *store = irbuilder.CreateAlignedStore(llvmf, llvmgv, sizeof(void*));
store->setAtomic(AtomicOrdering::Release);
irbuilder.CreateBr(ccall_bb);

@@ -231,7 +231,7 @@ static GlobalVariable *emit_plt_thunk(
IRBuilder<> irbuilder(b0);
Value *ptr = runtime_sym_lookup(irbuilder, funcptype, f_lib, f_name, plt, libptrgv,
llvmgv, runtime_lib);
auto store = irbuilder.CreateAlignedStore(irbuilder.CreateBitCast(ptr, T_pvoidfunc), got, sizeof(void*));
StoreInst *store = irbuilder.CreateAlignedStore(irbuilder.CreateBitCast(ptr, T_pvoidfunc), got, sizeof(void*));
store->setAtomic(AtomicOrdering::Release);
SmallVector<Value*, 16> args;
for (Function::arg_iterator arg = plt->arg_begin(), arg_e = plt->arg_end(); arg != arg_e; ++arg)
@@ -314,7 +314,7 @@ static Value *emit_plt(
// consume ordering too. This is even less likely to cause issues though
// since the only thing we do to this loaded pointer is to call it
// immediately.
// got_val->setAtomic(AtomicOrdering::Consume);
got_val->setAtomic(AtomicOrdering::Unordered);
return ctx.builder.CreateBitCast(got_val, funcptype);
}

Expand Down Expand Up @@ -429,17 +429,19 @@ static Value *llvm_type_rewrite(
Value *from;
Value *to;
const DataLayout &DL = jl_data_layout;
unsigned align = std::max(DL.getPrefTypeAlignment(target_type), DL.getPrefTypeAlignment(from_type));
if (DL.getTypeAllocSize(target_type) >= DL.getTypeAllocSize(from_type)) {
to = emit_static_alloca(ctx, target_type);
cast<AllocaInst>(to)->setAlignment(Align(align));
from = emit_bitcast(ctx, to, from_type->getPointerTo());
}
else {
from = emit_static_alloca(ctx, from_type);
cast<AllocaInst>(from)->setAlignment(Align(align));
to = emit_bitcast(ctx, from, target_type->getPointerTo());
}
// XXX: deal with possible alignment issues
ctx.builder.CreateStore(v, from);
return ctx.builder.CreateLoad(to);
ctx.builder.CreateAlignedStore(v, from, align);
return ctx.builder.CreateAlignedLoad(to, align);
}

// --- argument passing and scratch space utilities ---
@@ -1604,9 +1606,9 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
Value *ptls_i16 = emit_bitcast(ctx, ctx.ptlsStates, T_pint16);
const int tid_offset = offsetof(jl_tls_states_t, tid);
Value *ptid = ctx.builder.CreateGEP(ptls_i16, ConstantInt::get(T_size, tid_offset / 2));
return mark_or_box_ccall_result(ctx,
tbaa_decorate(tbaa_const, ctx.builder.CreateLoad(ptid)),
retboxed, rt, unionall, static_rt);
LoadInst *tid = ctx.builder.CreateAlignedLoad(ptid, sizeof(int16_t));
tbaa_decorate(tbaa_const, tid);
return mark_or_box_ccall_result(ctx, tid, retboxed, rt, unionall, static_rt);
}
else if (is_libjulia_func(jl_get_current_task)) {
assert(lrt == T_prjlvalue);
@@ -1615,9 +1617,9 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
Value *ptls_pv = emit_bitcast(ctx, ctx.ptlsStates, T_pprjlvalue);
const int ct_offset = offsetof(jl_tls_states_t, current_task);
Value *pct = ctx.builder.CreateGEP(ptls_pv, ConstantInt::get(T_size, ct_offset / sizeof(void*)));
return mark_or_box_ccall_result(ctx,
tbaa_decorate(tbaa_const, ctx.builder.CreateLoad(pct)),
retboxed, rt, unionall, static_rt);
LoadInst *ct = ctx.builder.CreateAlignedLoad(pct, sizeof(void*));
tbaa_decorate(tbaa_const, ct);
return mark_or_box_ccall_result(ctx, ct, retboxed, rt, unionall, static_rt);
}
else if (is_libjulia_func(jl_sigatomic_begin)) {
assert(lrt == T_void);
Expand All @@ -1626,8 +1628,7 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
ctx.builder.CreateCall(prepare_call(gcroot_flush_func));
Value *pdefer_sig = emit_defer_signal(ctx);
Value *defer_sig = ctx.builder.CreateLoad(pdefer_sig);
defer_sig = ctx.builder.CreateAdd(defer_sig,
ConstantInt::get(T_sigatomic, 1));
defer_sig = ctx.builder.CreateAdd(defer_sig, ConstantInt::get(T_sigatomic, 1));
ctx.builder.CreateStore(defer_sig, pdefer_sig);
emit_signal_fence(ctx);
return ghostValue(jl_nothing_type);
@@ -1689,7 +1690,9 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
idx = ctx.builder.CreateAdd(idx, ConstantInt::get(T_size, ((jl_datatype_t*)ety)->layout->first_ptr));
}
Value *slot_addr = ctx.builder.CreateInBoundsGEP(T_prjlvalue, arrayptr, idx);
Value *load = tbaa_decorate(tbaa_ptrarraybuf, ctx.builder.CreateLoad(T_prjlvalue, slot_addr));
LoadInst *load = ctx.builder.CreateAlignedLoad(T_prjlvalue, slot_addr, sizeof(void*));
load->setAtomic(AtomicOrdering::Unordered);
tbaa_decorate(tbaa_ptrarraybuf, load);
Value *res = ctx.builder.CreateZExt(ctx.builder.CreateICmpNE(load, Constant::getNullValue(T_prjlvalue)), T_int32);
JL_GC_POP();
return mark_or_box_ccall_result(ctx, res, retboxed, rt, unionall, static_rt);