From 3f475feff54914dabc3d50ea19781b2865dd0bb1 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 28 Jun 2023 11:39:35 -0600 Subject: [PATCH 1/4] Allocation Profiler: Types for all allocations Before this PR, the allocation profiler was missing the types for allocations in two cases: 1. allocations from codegen 2. allocations in gc_managed_realloc_ The second one is easy: gc_managed_realloc_ is currently used exclusively for resizing array buffers, so those allocations are now recorded with the buffer type tag (`jl_buff_tag`). For the first one: this PR adds a new exported Julia runtime function, `jl_maybe_record_alloc_to_profile`, which codegen will call after every allocation, to record the allocation and set its type. --- src/gc.c | 16 +++++++++++----- src/jl_exported_funcs.inc | 1 + src/llvm-final-gc-lowering.cpp | 8 +++++--- src/llvm-late-gc-lowering.cpp | 18 ++++++++++++++++++ src/llvm-pass-helpers.cpp | 21 +++++++++++++++++++++ src/llvm-pass-helpers.h | 3 +++ 6 files changed, 59 insertions(+), 8 deletions(-) diff --git a/src/gc.c b/src/gc.c index 9e588c171a676..1e57bd43f8770 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1006,11 +1006,10 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) return jl_valueof(&v->header); } -// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. +// External-facing version of jl_gc_big_alloc_inner, called into by LLVM-generated code. JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) { jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); - maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag); return val; } @@ -1316,12 +1315,18 @@ STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset return jl_valueof(v); } -// Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code. +// Record an allocation, called into by LLVM-generated code. 
+JL_DLLEXPORT jl_value_t *jl_maybe_record_alloc_to_profile(jl_value_t* val, int osize, + jl_value_t* type) +{ + maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); +} + +// External-facing version of jl_gc_pool_alloc_inner, called into by LLVM-generated code. JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, int osize) { jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); - maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag); return val; } @@ -3776,7 +3781,8 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds SetLastError(last_error); #endif errno = last_errno; - maybe_record_alloc_to_profile((jl_value_t*)b, sz, jl_gc_unknown_type_tag); + // gc_managed_realloc_ is currently used exclusively for resizing array buffers. + maybe_record_alloc_to_profile((jl_value_t*)b, sz, (jl_datatype_t*)jl_buff_tag); return b; } diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index 33b431fe12a76..71c60d856efc0 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -185,6 +185,7 @@ XX(jl_gc_new_weakref_th) \ XX(jl_gc_num) \ XX(jl_gc_pool_alloc) \ + XX(jl_maybe_record_alloc_to_profile) \ XX(jl_gc_queue_multiroot) \ XX(jl_gc_queue_root) \ XX(jl_gc_safepoint) \ diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index ac7d67cddd6f3..b8285a7bd79c0 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -48,6 +48,7 @@ struct FinalLowerGC: private JuliaPassContext { Function *queueRootFunc; Function *poolAllocFunc; Function *bigAllocFunc; + Function *recordAllocFunc; Function *allocTypedFunc; Instruction *pgcstack; Type *T_size; @@ -253,10 +254,11 @@ bool FinalLowerGC::doInitialization(Module &M) { queueRootFunc = getOrDeclare(jl_well_known::GCQueueRoot); poolAllocFunc = getOrDeclare(jl_well_known::GCPoolAlloc); bigAllocFunc = getOrDeclare(jl_well_known::GCBigAlloc); + recordAllocFunc = 
getOrDeclare(jl_well_known::GCRecordAllocToProfile); allocTypedFunc = getOrDeclare(jl_well_known::GCAllocTyped); T_size = M.getDataLayout().getIntPtrType(M.getContext()); - GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, allocTypedFunc}; + GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, recordAllocFunc, allocTypedFunc}; unsigned j = 0; for (unsigned i = 0; i < sizeof(functionList) / sizeof(void*); i++) { if (!functionList[i]) @@ -272,8 +274,8 @@ bool FinalLowerGC::doInitialization(Module &M) { bool FinalLowerGC::doFinalization(Module &M) { - GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, allocTypedFunc}; - queueRootFunc = poolAllocFunc = bigAllocFunc = allocTypedFunc = nullptr; + GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, recordAllocFunc, allocTypedFunc}; + queueRootFunc = poolAllocFunc = bigAllocFunc = recordAllocFunc = allocTypedFunc = nullptr; auto used = M.getGlobalVariable("llvm.compiler.used"); if (!used) return false; diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index f1cef798224d2..5f67024cc0f44 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2404,6 +2404,24 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { store->setOrdering(AtomicOrdering::Unordered); store->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); + auto recordAllocIntrinsic = getOrDeclare(jl_well_known::GCRecordAllocToProfile); + auto value = newI; + //auto record_alloc = + builder.CreateCall( + recordAllocIntrinsic, + { + value, + builder.CreateIntCast( + CI->getArgOperand(1), + allocBytesIntrinsic->getFunctionType()->getParamType(1), + false), + tag + }); + // TODO: is this needed? What is it? 
+ //record_alloc->setOrdering(AtomicOrdering::Unordered); + //record_alloc->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); + + // Replace uses of the call to `julia.gc_alloc_obj` with the call to // `julia.gc_alloc_bytes`. CI->replaceAllUsesWith(newI); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index b006f191937f5..d87358f577ca1 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -238,6 +238,7 @@ namespace jl_intrinsics { namespace jl_well_known { static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc); static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc); + static const char *GC_RECORD_ALLOC_TO_PROFILE_NAME = XSTR(jl_maybe_record_alloc_to_profile); static const char *GC_QUEUE_ROOT_NAME = XSTR(jl_gc_queue_root); static const char *GC_ALLOC_TYPED_NAME = XSTR(jl_gc_alloc_typed); @@ -275,6 +276,26 @@ namespace jl_well_known { return addGCAllocAttributes(poolAllocFunc); }); + const WellKnownFunctionDescription GCRecordAllocToProfile( + GC_RECORD_ALLOC_TO_PROFILE_NAME, + [](Type *T_size) { + auto &ctx = T_size->getContext(); + auto T_prjlvalue = JuliaType::get_prjlvalue_ty(ctx); + auto recordAllocFunc = Function::Create( + FunctionType::get( + T_prjlvalue, + { + T_size, + Type::getInt32Ty(ctx), + T_size, + }, + false), + Function::ExternalLinkage, + GC_RECORD_ALLOC_TO_PROFILE_NAME); + recordAllocFunc->addFnAttr(Attribute::getWithAllocSizeArgs(ctx, 2, None)); + return addGCAllocAttributes(recordAllocFunc); + }); + const WellKnownFunctionDescription GCQueueRoot( GC_QUEUE_ROOT_NAME, [](Type *T_size) { diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index 727f463dc50ef..a6cb36be282e1 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -147,6 +147,9 @@ namespace jl_well_known { // `jl_gc_pool_alloc`: allocates bytes. extern const WellKnownFunctionDescription GCPoolAlloc; + // `jl_maybe_record_alloc_to_profile`: records an allocation to the alloc profile. 
+ extern const WellKnownFunctionDescription GCRecordAllocToProfile; + // `jl_gc_queue_root`: queues a GC root. extern const WellKnownFunctionDescription GCQueueRoot; From 8961c263bc6b7eee24a5a92edbe549a7acac7bcd Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 28 Jun 2023 13:57:52 -0600 Subject: [PATCH 2/4] Fixups --- src/gc.c | 2 +- src/llvm-late-gc-lowering.cpp | 2 +- src/llvm-pass-helpers.cpp | 9 ++++++--- stdlib/Profile/test/allocs.jl | 20 ++++++++++++++++++++ 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/gc.c b/src/gc.c index 1e57bd43f8770..b9befe47bcff7 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1316,7 +1316,7 @@ STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset } // Record an allocation, called into by LLVM-generated code. -JL_DLLEXPORT jl_value_t *jl_maybe_record_alloc_to_profile(jl_value_t* val, int osize, +JL_DLLEXPORT void jl_maybe_record_alloc_to_profile(jl_value_t* val, int osize, jl_value_t* type) { maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 5f67024cc0f44..d9020d0d8f95a 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2413,7 +2413,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { value, builder.CreateIntCast( CI->getArgOperand(1), - allocBytesIntrinsic->getFunctionType()->getParamType(1), + recordAllocIntrinsic->getFunctionType()->getParamType(1), false), tag }); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index d87358f577ca1..a7b0286aa288d 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -281,10 +281,12 @@ namespace jl_well_known { [](Type *T_size) { auto &ctx = T_size->getContext(); auto T_prjlvalue = JuliaType::get_prjlvalue_ty(ctx); + auto T_void = Type::getVoidTy(ctx); auto recordAllocFunc = Function::Create( FunctionType::get( - T_prjlvalue, + T_void, { + //T_prjlvalue, T_size, 
Type::getInt32Ty(ctx), T_size, @@ -292,8 +294,9 @@ namespace jl_well_known { false), Function::ExternalLinkage, GC_RECORD_ALLOC_TO_PROFILE_NAME); - recordAllocFunc->addFnAttr(Attribute::getWithAllocSizeArgs(ctx, 2, None)); - return addGCAllocAttributes(recordAllocFunc); + // TODO: what is this? + //recordAllocFunc->addFnAttr(Attribute::getWithAllocSizeArgs(ctx, 2, None)); + return recordAllocFunc; }); const WellKnownFunctionDescription GCQueueRoot( diff --git a/stdlib/Profile/test/allocs.jl b/stdlib/Profile/test/allocs.jl index c2ec7d2f6cb54..51d9d9bad47cf 100644 --- a/stdlib/Profile/test/allocs.jl +++ b/stdlib/Profile/test/allocs.jl @@ -121,3 +121,23 @@ end @test length(prof.allocs) >= 1 @test length([a for a in prof.allocs if a.type == String]) >= 1 end + +@testset "alloc profiler catches allocs from codegen" begin + @eval begin + struct MyType x::Int; y::Int end + Base.:(+)(n::Number, x::MyType) = n + x.x + x.y + foo(a, x) = a[1] + x + wrapper(a) = foo(a, MyType(0,1)) + end + a = Any[1,2,3] + # warmup + wrapper(a) + + @eval Allocs.@profile sample_rate=1 wrapper(a) + + prof = Allocs.fetch() + Allocs.clear() + + @test length(prof.allocs) >= 1 + @test length([a for a in prof.allocs if a.type == MyType]) >= 1 +end From 03f049ea7ebb3803232165258be1753ad7dace00 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 28 Jun 2023 16:00:03 -0600 Subject: [PATCH 3/4] Fix missing type cast --- src/llvm-late-gc-lowering.cpp | 2 +- stdlib/Profile/test/allocs.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index d9020d0d8f95a..12a2cfdbd7ad2 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2415,7 +2415,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { CI->getArgOperand(1), recordAllocIntrinsic->getFunctionType()->getParamType(1), false), - tag + builder.CreatePtrToInt(tag, T_size), }); // TODO: is this needed? What is it? 
//record_alloc->setOrdering(AtomicOrdering::Unordered); diff --git a/stdlib/Profile/test/allocs.jl b/stdlib/Profile/test/allocs.jl index 51d9d9bad47cf..d932463370bba 100644 --- a/stdlib/Profile/test/allocs.jl +++ b/stdlib/Profile/test/allocs.jl @@ -133,7 +133,7 @@ end # warmup wrapper(a) - @eval Allocs.@profile sample_rate=1 wrapper(a) + @eval Allocs.@profile sample_rate=1 wrapper($a) prof = Allocs.fetch() Allocs.clear() From c560b275519d21b20c1d584b5073ee966ea709c5 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 28 Jun 2023 16:09:57 -0600 Subject: [PATCH 4/4] Add a test for BufferTypes --- stdlib/Profile/test/allocs.jl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/stdlib/Profile/test/allocs.jl b/stdlib/Profile/test/allocs.jl index d932463370bba..cc6e35f9e96c2 100644 --- a/stdlib/Profile/test/allocs.jl +++ b/stdlib/Profile/test/allocs.jl @@ -141,3 +141,15 @@ end @test length(prof.allocs) >= 1 @test length([a for a in prof.allocs if a.type == MyType]) >= 1 end + +@testset "alloc profiler catches allocs from buffer resize" begin + a = Int[] + Allocs.@profile sample_rate=1 for _ in 1:100; push!(a, 1); end + + prof = Allocs.fetch() + Allocs.clear() + + @test length(prof.allocs) >= 1 + @test length([a for a in prof.allocs if a.type == Profile.Allocs.BufferType]) >= 1 +end +