Merge 61d8d44 into 86ee57c
vtjnash authored Apr 25, 2020
2 parents 86ee57c + 61d8d44 commit 8ce701c
Showing 24 changed files with 526 additions and 311 deletions.
19 changes: 15 additions & 4 deletions doc/src/devdocs/locks.md
@@ -33,9 +33,6 @@ The following are definitely leaf locks (level 1), and must not try to acquire a
The following is a leaf lock (level 2), and only acquires level 1 locks (safepoint) internally:

> * typecache
The following is a level 2 lock:

> * Module->lock
The following is a level 3 lock, which can only acquire level 1 or level 2 locks internally:
@@ -48,9 +45,10 @@ The following is a level 4 lock, which can only recurse to acquire level 1, 2, o
No Julia code may be called while holding a lock above this point.

The following is a level 6 lock, which can only recurse to acquire locks at lower levels:
The following are level 6 locks, which can only recurse to acquire locks at lower levels:

> * codegen
> * jl_modules_mutex
The following is an almost root lock (level end-1), meaning only the root lock may be held when
trying to acquire it:
@@ -94,6 +92,19 @@ The following locks are broken:
>
> fix: create it
* Module->lock

> This is vulnerable to deadlocks, since there is no guarantee it is acquired in a consistent order.
> Some operations (such as `import_module`) are missing a lock.
>
> fix: replace with `jl_modules_mutex`?
* loading.jl: `require` and `register_root_module`

> This file potentially has numerous problems.
>
> fix: needs locks
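
A rough illustration of the kind of fix suggested above, as a minimal sketch (the lock, table, and function names are hypothetical, not taken from `loading.jl`):

```julia
# Hypothetical sketch: serialize module registration behind a single lock so
# that concurrent `require` calls cannot race on the shared table.
const _require_lock = ReentrantLock()
const _loaded_modules = Dict{Symbol,Module}()

function register_root_module_locked(m::Module)
    lock(_require_lock) do
        _loaded_modules[nameof(m)] = m  # mutate the shared table only while holding the lock
    end
    return m
end
```
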
## Shared Global Data Structures

These data structures each need locks due to being shared mutable global state. It is the inverse
49 changes: 49 additions & 0 deletions doc/src/manual/parallel-computing.md
@@ -283,6 +283,55 @@ julia> Threads.threadid()
three processes have 2 threads enabled. For more fine-grained control over worker
threads use [`addprocs`](@ref) and pass `-t`/`--threads` as `exeflags`.
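
A hedged sketch of that invocation (the worker count, thread count, and resulting worker ids shown here are illustrative):

```julia-repl
julia> using Distributed

julia> addprocs(3; exeflags=`--threads=2`)  # 3 workers, each started with 2 threads
3-element Array{Int64,1}:
 2
 3
 4
```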

## Data-race freedom

You are entirely responsible for ensuring that your program is data-race free,
and nothing promised here can be assumed if you do not observe that
requirement. The results may be highly unintuitive.

The best way to ensure this is to acquire a lock around any access to data that
can be observed from multiple threads. For example, in most cases you should
use the following code pattern:

```julia-repl
julia> lock(a) do
           use(a)
       end

julia> begin
           lock(a)
           try
               use(a)
           finally
               unlock(a)
           end
       end
```
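
For concreteness, here is a minimal self-contained sketch of this pattern; the lock and data names are illustrative rather than taken from the manual:

```julia
# Minimal sketch: a ReentrantLock guarding a shared Dict across threads.
const dict_lock = ReentrantLock()
const shared = Dict{Int,Int}()

Threads.@threads for i in 1:1000
    lock(dict_lock) do
        shared[i] = i^2  # every access to `shared` happens while holding the lock
    end
end
```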

Additionally, Julia will automatically ensure memory safety in the presence of
some types of data races.

When publishing a boxed value to a global variable, that value can be read on
another thread without additional synchronization. However, do not assume that
observing that one value has been set transitively implies anything about other
values, even other globals!

```julia
Thread 1:
global b = false
global a = rand()
global b = true
Thread 2:
while !b; end
bad(a) # it is NOT safe to access `a` here!
Thread 3:
while !@isdefined(a); end
use(a) # it IS safe to access `a` here
```


## The `@threads` Macro

Let's work a simple example using our native threads. Let us create an array of zeros:
15 changes: 9 additions & 6 deletions src/array.c
@@ -542,10 +542,9 @@ JL_DLLEXPORT jl_value_t *jl_ptrarrayref(jl_array_t *a JL_PROPAGATES_ROOT, size_t
{
assert(i < jl_array_len(a));
assert(a->flags.ptrarray);
jl_value_t *elt = ((jl_value_t**)a->data)[i];
if (elt == NULL) {
jl_value_t *elt = jl_atomic_load_relaxed(((jl_value_t**)a->data) + i);
if (elt == NULL)
jl_throw(jl_undefref_exception);
}
return elt;
}

@@ -569,7 +568,7 @@ JL_DLLEXPORT jl_value_t *jl_arrayref(jl_array_t *a, size_t i)
JL_DLLEXPORT int jl_array_isassigned(jl_array_t *a, size_t i)
{
if (a->flags.ptrarray) {
return ((jl_value_t**)jl_array_data(a))[i] != NULL;
return jl_atomic_load_relaxed(((jl_value_t**)jl_array_data(a)) + i) != NULL;
}
else if (a->flags.hasptr) {
jl_datatype_t *eltype = (jl_datatype_t*)jl_tparam0(jl_typeof(a));
@@ -600,12 +599,14 @@ JL_DLLEXPORT void jl_arrayset(jl_array_t *a JL_ROOTING_ARGUMENT, jl_value_t *rhs
if (jl_is_datatype_singleton((jl_datatype_t*)jl_typeof(rhs)))
return;
}
if (a->flags.hasptr)
jl_fence_release();
jl_assign_bits(&((char*)a->data)[i * a->elsize], rhs);
if (a->flags.hasptr)
jl_gc_multi_wb(jl_array_owner(a), rhs);
}
else {
((jl_value_t**)a->data)[i] = rhs;
jl_atomic_store_release(((jl_value_t**)a->data) + i, rhs);
jl_gc_wb(jl_array_owner(a), rhs);
}
}
@@ -615,7 +616,7 @@ JL_DLLEXPORT void jl_arrayunset(jl_array_t *a, size_t i)
if (i >= jl_array_len(a))
jl_bounds_error_int((jl_value_t*)a, i + 1);
if (a->flags.ptrarray)
((jl_value_t**)a->data)[i] = NULL;
jl_atomic_store_release(((jl_value_t**)a->data) + i, NULL);
else if (a->flags.hasptr) {
size_t elsize = a->elsize;
jl_assume(elsize >= sizeof(void*) && elsize % sizeof(void*) == 0);
@@ -1200,6 +1201,7 @@ JL_DLLEXPORT void jl_array_ptr_copy(jl_array_t *dest, void **dest_p,
{
assert(dest->flags.ptrarray && src->flags.ptrarray);
jl_value_t *owner = jl_array_owner(dest);
jl_fence_release(); // ensure contents of src are visible on other processors
// Destination is old and doesn't refer to any young object
if (__unlikely(jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED)) {
jl_value_t *src_owner = jl_array_owner(src);
@@ -1219,6 +1221,7 @@
}
}
memmove(dest_p, src_p, n * sizeof(void*));
jl_fence_release(); // to finish up, ensure contents of dest are now visible on other processors too
}

JL_DLLEXPORT void jl_array_ptr_1d_push(jl_array_t *a, jl_value_t *item)
53 changes: 50 additions & 3 deletions src/atomics.h
@@ -44,6 +44,8 @@
* specified.
*/
#if defined(__GNUC__)
# define jl_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST)
# define jl_fence_release() __atomic_thread_fence(__ATOMIC_RELEASE)
# define jl_signal_fence() __atomic_signal_fence(__ATOMIC_SEQ_CST)
# define jl_atomic_fetch_add_relaxed(obj, arg) \
__atomic_fetch_add(obj, arg, __ATOMIC_RELAXED)
@@ -68,13 +70,13 @@
__sync_bool_compare_and_swap(obj, expected, desired)
# define jl_atomic_exchange(obj, desired) \
__atomic_exchange_n(obj, desired, __ATOMIC_SEQ_CST)
# define jl_atomic_exchange_generic(obj, desired, orig)\
__atomic_exchange(obj, desired, orig, __ATOMIC_SEQ_CST)
# define jl_atomic_exchange_relaxed(obj, desired) \
__atomic_exchange_n(obj, desired, __ATOMIC_RELAXED)
// TODO: Maybe add jl_atomic_compare_exchange_weak for spin lock
# define jl_atomic_store(obj, val) \
__atomic_store_n(obj, val, __ATOMIC_SEQ_CST)
# define jl_atomic_store_relaxed(obj, val) \
__atomic_store_n(obj, val, __ATOMIC_RELAXED)
# if defined(__clang__) || defined(__ICC) || defined(__INTEL_COMPILER) || \
!(defined(_CPU_X86_) || defined(_CPU_X86_64_))
// ICC and Clang don't have this bug...
@@ -96,6 +98,9 @@
# define jl_atomic_load_relaxed(obj) \
__atomic_load_n(obj, __ATOMIC_RELAXED)
#elif defined(_COMPILER_MICROSOFT_)
// TODO: these only define compiler barriers, and aren't correct outside of x86
# define jl_fence() _ReadWriteBarrier()
# define jl_fence_release() _WriteBarrier()
# define jl_signal_fence() _ReadWriteBarrier()

// add
@@ -123,7 +128,6 @@ jl_atomic_fetch_add(T *obj, T2 arg)
{
return (T)_InterlockedExchangeAdd64((volatile __int64*)obj, (__int64)arg);
}
// TODO: jl_atomic_exchange_generic
#define jl_atomic_fetch_add_relaxed(obj, arg) jl_atomic_fetch_add(obj, arg)

// and
@@ -267,6 +271,12 @@ static inline void jl_atomic_store_release(volatile T *obj, T2 val)
jl_signal_fence();
*obj = (T)val;
}
template<typename T, typename T2>
static inline void jl_atomic_store_relaxed(volatile T *obj, T2 val)
{
jl_signal_fence();
*obj = (T)val;
}
// atomic loads
template<typename T>
static inline T jl_atomic_load(volatile T *obj)
@@ -286,4 +296,41 @@ static inline T jl_atomic_load_acquire(volatile T *obj)
# error "No atomic operations supported."
#endif

#ifdef __clang_analyzer__
// for the purposes of the analyzer, we can turn these into non-atomic expressions with similar properties

#undef jl_atomic_exchange
#undef jl_atomic_exchange_relaxed
#define jl_atomic_exchange(obj, desired) \
(__extension__({ \
__typeof__((obj)) p = (obj); \
__typeof__(*p) temp = *p; \
*p = desired; \
temp; \
}))
#define jl_atomic_exchange_relaxed jl_atomic_exchange

#undef jl_atomic_compare_exchange
#define jl_atomic_compare_exchange(obj, expected, desired) ((expected), jl_atomic_exchange((obj), (desired)))

#undef jl_atomic_bool_compare_exchange
#define jl_atomic_bool_compare_exchange(obj, expected, desired) ((expected) == jl_atomic_exchange((obj), (desired)))

#undef jl_atomic_store
#undef jl_atomic_store_release
#undef jl_atomic_store_relaxed
#define jl_atomic_store(obj, val) (*(obj) = (val))
#define jl_atomic_store_release(obj, val) (*(obj) = (val))
#define jl_atomic_store_relaxed(obj, val) (*(obj) = (val))

#undef jl_atomic_load
#undef jl_atomic_load_acquire
#undef jl_atomic_load_relaxed
#define jl_atomic_load(obj) (*(obj))
#define jl_atomic_load_acquire(obj) (*(obj))
#define jl_atomic_load_relaxed(obj) (*(obj))

#endif


#endif // JL_ATOMICS_H
11 changes: 8 additions & 3 deletions src/builtins.c
@@ -479,6 +479,7 @@ void STATIC_INLINE _grow_to(jl_value_t **root, jl_value_t ***oldargs, jl_svec_t
if (extra)
// grow by an extra 50% if newalloc is still only a guess
newalloc += oldalloc / 2 + 16;
JL_GC_PROMISE_ROOTED(*oldargs);
jl_svec_t *newheap = _copy_to(newalloc, *oldargs, oldalloc);
*root = (jl_value_t*)newheap;
*arg_heap = newheap;
@@ -556,9 +557,13 @@ static jl_value_t *do_apply(jl_value_t *F, jl_value_t **args, uint32_t nargs, jl
else {
// put arguments on the heap if there are too many
newargs = NULL;
n_alloc = 0;
assert(precount > 0); // let optimizer know this won't overflow
_grow_to(&roots[0], &newargs, &arg_heap, &n_alloc, precount, extra);
n_alloc = precount;
if (extra)
// grow by an extra 50% if newalloc is still only a guess
n_alloc += n_alloc / 2 + 16;
arg_heap = jl_alloc_svec(n_alloc);
roots[0] = (jl_value_t*)arg_heap;
newargs = jl_svec_data(arg_heap);
}
newargs[0] = f;
precount -= 1;
37 changes: 20 additions & 17 deletions src/ccall.cpp
@@ -135,15 +135,15 @@ static Value *runtime_sym_lookup(
BasicBlock *dlsym_lookup = BasicBlock::Create(jl_LLVMContext, "dlsym");
BasicBlock *ccall_bb = BasicBlock::Create(jl_LLVMContext, "ccall");
Constant *initnul = ConstantPointerNull::get((PointerType*)T_pvoidfunc);
LoadInst *llvmf_orig = irbuilder.CreateAlignedLoad(llvmgv, sizeof(void*));
LoadInst *llvmf_orig = irbuilder.CreateAlignedLoad(T_pvoidfunc, llvmgv, sizeof(void*));
// This in principle needs a consume ordering so that load from
// this pointer sees a valid value. However, this is not supported by
// LLVM (or agreed on in the C/C++ standard FWIW) and should be
// almost impossible to happen on every platform we support since this
// ordering is enforced by the hardware and LLVM has to speculate an
// invalid load from the `cglobal` but doesn't depend on the `cglobal`
// value for this to happen.
// llvmf_orig->setAtomic(AtomicOrdering::Consume);
llvmf_orig->setAtomic(AtomicOrdering::Unordered);
irbuilder.CreateCondBr(
irbuilder.CreateICmpNE(llvmf_orig, initnul),
ccall_bb,
@@ -162,7 +162,7 @@ }
}
Value *llvmf = irbuilder.CreateCall(prepare_call_in(jl_builderModule(irbuilder), jldlsym_func),
{ libname, stringConstPtr(irbuilder, f_name), libptrgv });
auto store = irbuilder.CreateAlignedStore(llvmf, llvmgv, sizeof(void*));
StoreInst *store = irbuilder.CreateAlignedStore(llvmf, llvmgv, sizeof(void*));
store->setAtomic(AtomicOrdering::Release);
irbuilder.CreateBr(ccall_bb);

@@ -231,7 +231,7 @@ static GlobalVariable *emit_plt_thunk(
IRBuilder<> irbuilder(b0);
Value *ptr = runtime_sym_lookup(irbuilder, funcptype, f_lib, f_name, plt, libptrgv,
llvmgv, runtime_lib);
auto store = irbuilder.CreateAlignedStore(irbuilder.CreateBitCast(ptr, T_pvoidfunc), got, sizeof(void*));
StoreInst *store = irbuilder.CreateAlignedStore(irbuilder.CreateBitCast(ptr, T_pvoidfunc), got, sizeof(void*));
store->setAtomic(AtomicOrdering::Release);
SmallVector<Value*, 16> args;
for (Function::arg_iterator arg = plt->arg_begin(), arg_e = plt->arg_end(); arg != arg_e; ++arg)
@@ -314,7 +314,7 @@ static Value *emit_plt(
// consume ordering too. This is even less likely to cause issues though
// since the only thing we do to this loaded pointer is to call it
// immediately.
// got_val->setAtomic(AtomicOrdering::Consume);
got_val->setAtomic(AtomicOrdering::Unordered);
return ctx.builder.CreateBitCast(got_val, funcptype);
}

Expand Down Expand Up @@ -429,17 +429,19 @@ static Value *llvm_type_rewrite(
Value *from;
Value *to;
const DataLayout &DL = jl_data_layout;
unsigned align = std::max(DL.getPrefTypeAlignment(target_type), DL.getPrefTypeAlignment(from_type));
if (DL.getTypeAllocSize(target_type) >= DL.getTypeAllocSize(from_type)) {
to = emit_static_alloca(ctx, target_type);
cast<AllocaInst>(to)->setAlignment(Align(align));
from = emit_bitcast(ctx, to, from_type->getPointerTo());
}
else {
from = emit_static_alloca(ctx, from_type);
cast<AllocaInst>(from)->setAlignment(Align(align));
to = emit_bitcast(ctx, from, target_type->getPointerTo());
}
// XXX: deal with possible alignment issues
ctx.builder.CreateStore(v, from);
return ctx.builder.CreateLoad(to);
ctx.builder.CreateAlignedStore(v, from, align);
return ctx.builder.CreateAlignedLoad(to, align);
}

// --- argument passing and scratch space utilities ---
@@ -1604,9 +1606,9 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
Value *ptls_i16 = emit_bitcast(ctx, ctx.ptlsStates, T_pint16);
const int tid_offset = offsetof(jl_tls_states_t, tid);
Value *ptid = ctx.builder.CreateGEP(ptls_i16, ConstantInt::get(T_size, tid_offset / 2));
return mark_or_box_ccall_result(ctx,
tbaa_decorate(tbaa_const, ctx.builder.CreateLoad(ptid)),
retboxed, rt, unionall, static_rt);
LoadInst *tid = ctx.builder.CreateAlignedLoad(ptid, sizeof(int16_t));
tbaa_decorate(tbaa_const, tid);
return mark_or_box_ccall_result(ctx, tid, retboxed, rt, unionall, static_rt);
}
else if (is_libjulia_func(jl_get_current_task)) {
assert(lrt == T_prjlvalue);
@@ -1615,9 +1617,9 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
Value *ptls_pv = emit_bitcast(ctx, ctx.ptlsStates, T_pprjlvalue);
const int ct_offset = offsetof(jl_tls_states_t, current_task);
Value *pct = ctx.builder.CreateGEP(ptls_pv, ConstantInt::get(T_size, ct_offset / sizeof(void*)));
return mark_or_box_ccall_result(ctx,
tbaa_decorate(tbaa_const, ctx.builder.CreateLoad(pct)),
retboxed, rt, unionall, static_rt);
LoadInst *ct = ctx.builder.CreateAlignedLoad(pct, sizeof(void*));
tbaa_decorate(tbaa_const, ct);
return mark_or_box_ccall_result(ctx, ct, retboxed, rt, unionall, static_rt);
}
else if (is_libjulia_func(jl_sigatomic_begin)) {
assert(lrt == T_void);
Expand All @@ -1626,8 +1628,7 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
ctx.builder.CreateCall(prepare_call(gcroot_flush_func));
Value *pdefer_sig = emit_defer_signal(ctx);
Value *defer_sig = ctx.builder.CreateLoad(pdefer_sig);
defer_sig = ctx.builder.CreateAdd(defer_sig,
ConstantInt::get(T_sigatomic, 1));
defer_sig = ctx.builder.CreateAdd(defer_sig, ConstantInt::get(T_sigatomic, 1));
ctx.builder.CreateStore(defer_sig, pdefer_sig);
emit_signal_fence(ctx);
return ghostValue(jl_nothing_type);
@@ -1689,7 +1690,9 @@ static jl_cgval_t emit_ccall(jl_codectx_t &ctx, jl_value_t **args, size_t nargs)
idx = ctx.builder.CreateAdd(idx, ConstantInt::get(T_size, ((jl_datatype_t*)ety)->layout->first_ptr));
}
Value *slot_addr = ctx.builder.CreateInBoundsGEP(T_prjlvalue, arrayptr, idx);
Value *load = tbaa_decorate(tbaa_ptrarraybuf, ctx.builder.CreateLoad(T_prjlvalue, slot_addr));
LoadInst *load = ctx.builder.CreateAlignedLoad(T_prjlvalue, slot_addr, sizeof(void*));
load->setAtomic(AtomicOrdering::Unordered);
tbaa_decorate(tbaa_ptrarraybuf, load);
Value *res = ctx.builder.CreateZExt(ctx.builder.CreateICmpNE(load, Constant::getNullValue(T_prjlvalue)), T_int32);
JL_GC_POP();
return mark_or_box_ccall_result(ctx, res, retboxed, rt, unionall, static_rt);