Skip to content

Commit

Permalink
Merge pull request #17178 from JuliaLang/yyc/threads/elf
Browse files Browse the repository at this point in the history
RFC: Optimize TLS access in generated code on Linux
  • Loading branch information
yuyichao authored Jun 30, 2016
2 parents a66ace6 + e2bd129 commit c23687f
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 1 deletion.
30 changes: 29 additions & 1 deletion src/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@
#include <llvm/ExecutionEngine/JITMemoryManager.h>
#include <llvm/ExecutionEngine/Interpreter.h>
#endif
#if defined(_CPU_ARM_) || defined(_CPU_AARCH64_)
#if defined(_CPU_ARM_) || defined(_CPU_AARCH64_) || \
(defined(LLVM37) && defined(JULIA_ENABLE_THREADING))
# include <llvm/IR/InlineAsm.h>
#endif
#if defined(USE_POLLY)
Expand Down Expand Up @@ -3465,6 +3466,31 @@ static void finalize_gc_frame(Function *F)
ptlsStates->setCalledFunction(getter);
ptlsStates->setAttributes(jltls_states_func->getAttributes());
}
else if (jl_tls_offset != -1) {
#ifdef LLVM37
// Replace the function call with inline assembly if we know
// how to generate it.
const char *asm_str = nullptr;
# if defined(_CPU_X86_64_)
asm_str = "movq %fs:0, $0";
# elif defined(_CPU_X86_)
asm_str = "movl %gs:0, $0";
# elif defined(_CPU_AARCH64_)
asm_str = "mrs $0, tpidr_el0";
# endif
assert(asm_str && "Cannot emit thread pointer for this architecture.");
static auto offset = ConstantInt::getSigned(T_size, jl_tls_offset);
static auto tp = InlineAsm::get(FunctionType::get(T_pint8, false),
asm_str, "=r", false);
Value *tls = CallInst::Create(tp, "thread_ptr", ptlsStates);
tls = GetElementPtrInst::Create(T_int8, tls, {offset},
"ptls_i8", ptlsStates);
tls = new BitCastInst(tls, PointerType::get(T_ppjlvalue, 0),
"ptls", ptlsStates);
ptlsStates->replaceAllUsesWith(tls);
ptlsStates->eraseFromParent();
#endif
}
#else
ptlsStates->replaceAllUsesWith(prepare_global(jltls_states_var, M));
ptlsStates->eraseFromParent();
Expand Down Expand Up @@ -5164,6 +5190,8 @@ static void init_julia_llvm_env(Module *m)
// In non-imaging mode, (i.e. the code will not be saved to disk), we
// use the address of the actual getter function directly
// (`jl_tls_states_cb` returned by `jl_get_ptls_states_getter()`)
// (Alternatively if we know how to generate the tls address directly
// we will inline the assembly, see `finalize_gc_frame(Function*)`)
// In imaging mode, we emit the function address as a load of a static
// variable to be filled (in `dump.c`) at initialization time of the sysimg.
// This way we can bypass the extra indirection in `jl_get_ptls_states`
Expand Down
1 change: 1 addition & 0 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ void _julia_init(JL_IMAGE_SEARCH rel);

void jl_set_base_ctx(char *__stk);

extern size_t jl_tls_offset;
void jl_init_threading(void);
void jl_start_threads(void);
void jl_shutdown_threading(void);
Expand Down
111 changes: 111 additions & 0 deletions src/threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,21 @@
#include "julia.h"
#include "julia_internal.h"

// Select the ELF static TLS layout used by the TLS offset detection below.
// `JL_ELF_TLS_VARIANT` chooses which `jl_add_tls_size`/`jl_check_tls_bound`
// pair is compiled. `JL_ELF_TLS_INIT_SIZE` is the starting value of the
// accumulated static TLS size: for Variant 1 it accounts for the fixed size
// TIB that precedes the static TLS buffer, for Variant 2 it is 0.
// Both macros are left undefined on any other OS/CPU combination, which
// disables the detection entirely.
#ifdef _OS_LINUX_
# if defined(_CPU_X86_64_) || defined(_CPU_X86_)
# define JL_ELF_TLS_VARIANT 2
# define JL_ELF_TLS_INIT_SIZE 0
# endif
# if defined(_CPU_AARCH64_)
# define JL_ELF_TLS_VARIANT 1
# define JL_ELF_TLS_INIT_SIZE 16
# endif
#endif

#ifdef JL_ELF_TLS_VARIANT
# include <link.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif
Expand Down Expand Up @@ -335,11 +350,107 @@ void ti_threadfun(void *arg)
void ti_reset_timings(void);
#endif

// Offset of the thread's `jl_tls_states_t` relative to the thread pointer,
// filled in by `jl_check_tls` when the static TLS layout could be detected.
// Stays at `-1` (i.e. all bits set, since the type is `size_t`) when the
// offset is unknown; codegen tests `jl_tls_offset != -1` before using it.
// NOTE: for TLS Variant 2 the stored offset is negative and therefore wrapped
// into `size_t`; consumers must interpret the value as signed.
size_t jl_tls_offset = -1;

#ifdef JL_ELF_TLS_VARIANT
// Optimize TLS access in codegen if the TLS buffer is using an IE or LE model.
// To detect such a case, we find the size of the TLS segment in the main
// executable and the TIB pointer (i.e. the thread pointer), and then check
// whether the TLS pointer of the current thread is in the right range.
// This can in principle be extended to the case where the TLS buffer is
// in a shared library but is still part of the static buffer; however, that
// seems harder to detect.
# if JL_ELF_TLS_VARIANT == 1
// Variant 1 layout: the static TLS buffer follows a fixed size TIB, so a new
// segment starts at the already accumulated size rounded up to the segment's
// alignment.
//
// orig_size: size accumulated so far (TIB included)
// size:      memory size of the TLS segment being added
// align:     alignment requirement of that segment
// Returns the accumulated size with the segment appended.
static inline size_t jl_add_tls_size(size_t orig_size, size_t size, size_t align)
{
    size_t segment_start = LLT_ALIGN(orig_size, align);
    return segment_start + size;
}
// Variant 1 bound check: the TLS states must start after the TIB and end
// within `tls_size` bytes counted from the thread pointer.
//
// tp:       thread pointer of the current thread
// ptls:     address of the current thread's TLS states
// tls_size: total static TLS size (TIB included) of the main executable
// Returns the offset of `ptls` from `tp`, or -1 if `ptls` lies outside of
// the expected range.
static inline ssize_t jl_check_tls_bound(void *tp, void *ptls, size_t tls_size)
{
    const ssize_t distance = (char*)ptls - (char*)tp;
    // Anything below the end of the TIB cannot be part of the TLS buffer.
    if (distance < JL_ELF_TLS_INIT_SIZE)
        return -1;
    // `distance` is known to be non-negative from here on.
    const size_t states_end = (size_t)distance + sizeof(jl_tls_states_t);
    if (states_end > tls_size)
        return -1;
    return distance;
}
# elif JL_ELF_TLS_VARIANT == 2
// Variant 2 layout: the static TLS buffer sits in front of a TIB of unknown
// size, so the alignment is applied to the size after the new segment has
// been added.
//
// orig_size: size accumulated so far
// size:      memory size of the TLS segment being added
// align:     alignment requirement of that segment
// Returns the accumulated size including the segment.
static inline size_t jl_add_tls_size(size_t orig_size, size_t size, size_t align)
{
    size_t unaligned_total = orig_size + size;
    return LLT_ALIGN(unaligned_total, align);
}
// Variant 2 bound check: the static TLS buffer occupies the `tls_size` bytes
// directly below the thread pointer, so the TLS states must start at least
// `sizeof(jl_tls_states_t)` and at most `tls_size` bytes below `tp`.
//
// tp:       thread pointer of the current thread
// ptls:     address of the current thread's TLS states
// tls_size: total static TLS size of the main executable
// Returns the (negative) offset of `ptls` from `tp`, or -1 if `ptls` lies
// outside of the expected range. A valid result is never -1 as long as
// `jl_tls_states_t` is larger than one byte.
static inline ssize_t jl_check_tls_bound(void *tp, void *ptls, size_t tls_size)
{
    ssize_t offset = (char*)tp - (char*)ptls;
    // `ptls` above the thread pointer can never be inside the static buffer.
    // Check the sign explicitly: comparing a negative `ssize_t` against a
    // `size_t` would convert it to a huge unsigned value and only reject it
    // by accident.
    if (offset < 0)
        return -1;
    if ((size_t)offset < sizeof(jl_tls_states_t) || (size_t)offset > tls_size)
        return -1;
    return -offset;
}
# else
# error "Unknown static TLS variant"
# endif

// Find the size of the TLS segment in the main executable
// Result record handed to `dl_iterate_phdr` as the user data of
// `check_tls_cb`.
typedef struct {
// Accumulated static TLS size written by `check_tls_cb`; it keeps whatever
// value the caller initialized it with if the callback is never invoked.
size_t total_size;
} check_tls_cb_t;

// `dl_iterate_phdr` callback computing the static TLS size of one object.
//
// info:  program header description of the object being visited
// size:  size of `*info` (unused)
// _data: points to the `check_tls_cb_t` that receives the result
//
// Starts from `JL_ELF_TLS_INIT_SIZE` and adds every `PT_TLS` segment found
// using the variant specific `jl_add_tls_size`. Always returns 1 so that the
// iteration stops after the first object (the main executable).
static int check_tls_cb(struct dl_phdr_info *info, size_t size, void *_data)
{
    check_tls_cb_t *result = (check_tls_cb_t*)_data;
    size_t accumulated = JL_ELF_TLS_INIT_SIZE;
    unsigned remaining = info->dlpi_phnum;

    // There should be only one TLS segment, but all headers are scanned and
    // every match is accounted for.
    for (const ElfW(Phdr) *seg = info->dlpi_phdr; remaining > 0; seg++, remaining--) {
        if (seg->p_type == PT_TLS)
            accumulated = jl_add_tls_size(accumulated, seg->p_memsz, seg->p_align);
    }
    result->total_size = accumulated;
    // A non-zero return value ends the iteration.
    return 1;
}

// Detect whether the TLS states of the current thread are located in the
// static TLS area of the main executable and, if they are, publish their
// offset from the thread pointer in `jl_tls_offset`. `jl_tls_offset` is left
// untouched when the layout cannot be confirmed.
static void jl_check_tls(void)
{
    jl_tls_states_t *ptls = jl_get_ptls_states();
    check_tls_cb_t tls_info = {0};
    dl_iterate_phdr(check_tls_cb, &tls_info);
    // No static TLS area was found for the main executable.
    if (tls_info.total_size == 0)
        return;

    // Read the thread pointer of the current thread.
    void *thread_ptr;
#if defined(_CPU_X86_64_)
    asm("movq %%fs:0, %0" : "=r"(thread_ptr));
#elif defined(_CPU_X86_)
    asm("movl %%gs:0, %0" : "=r"(thread_ptr));
#elif defined(_CPU_AARCH64_)
    asm("mrs %0, tpidr_el0" : "=r"(thread_ptr));
#else
#  error "Cannot emit thread pointer for this architecture."
#endif

    // -1 signals that `ptls` is not within the static TLS area.
    ssize_t tls_off = jl_check_tls_bound(thread_ptr, ptls, tls_info.total_size);
    if (tls_off != -1)
        jl_tls_offset = (size_t)tls_off;
}
#endif

// interface to Julia; sets up to make the runtime thread-safe
void jl_init_threading(void)
{
char *cp;

#ifdef JL_ELF_TLS_VARIANT
jl_check_tls();
#endif

// how many threads available, usable
int max_threads = jl_cpu_cores();
jl_n_threads = JULIA_NUM_THREADS;
Expand Down

0 comments on commit c23687f

Please sign in to comment.