
precompile: compile inlinable methods that were inferred #56156

Closed
topolarity wants to merge 1 commit from the ct/precompile-inlinable branch

Conversation

topolarity (Member)

The code generation policy here previously assumed that inlinable code almost never ends up being `invoke`d, which has the unfortunate side effect that making more code inline-eligible means fewer entry points to your code end up compiled.

Intuitively, I think users expect function calls that ran during pre-compilation to be compiled and available (regardless of the context they are called from), which requires us to pre-compile these methods.

This change may not be viable due to the increase in code size, but I wanted to open it to see what the effects are.
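To make the effect concrete, here is a minimal hypothetical workload (the function names are invented for illustration): `helper` is cheap enough to be inline-eligible, so under the old policy its specialization would not be cached into the image even though it ran during pre-compilation, and the dynamic-dispatch call has to recompile it at runtime.

# Hypothetical precompile workload (illustration only).
helper(x) = 2x + 1                  # tiny body: inline-eligible after inference

function run_workload(values::Vector{Any})
    total = 0
    for v in values
        # v::Any forces dynamic dispatch, so helper(::Int) is `invoke`d here
        # instead of being inlined into a type-stable caller.
        total += helper(v)
    end
    return total
end

run_workload(Any[1, 2, 3])          # executed during pre-compilation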

topolarity added the backport 1.11 (Change should be backported to release-1.11) label on Oct 14, 2024
topolarity force-pushed the ct/precompile-inlinable branch from 1c706b2 to f311c6a on October 14, 2024, 12:50
topolarity (Member, Author)

On my machine, this causes my sysimage to grow 150 MB → 173 MB

vchuravy (Member)

Oof.

On my machine, this causes my sysimage to grow 150 MB → 173 MB

Could we still throw out some of the smallest functions? E.g., have an inlining-cost threshold that allows medium-size functions to be cached, but not tiny ones that mostly forward things?
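A rough sketch of that policy in Julia, with stated assumptions: MIN_CACHED_COST is an invented threshold, worth_caching is a hypothetical helper, and jl_ir_inlining_cost is the existing runtime accessor for the inlining cost recorded in inferred IR (it also appears in the diff further down).

# Hypothetical caching predicate for the threshold suggestion above.
const MIN_CACHED_COST = UInt16(100)   # made-up threshold; would need tuning

function worth_caching(inferred)
    cost = ccall(:jl_ir_inlining_cost, UInt16, (Any,), inferred)
    # typemax(UInt16) means "not inlineable at all": always cache (status quo).
    # Otherwise cache only medium-size bodies, skipping tiny forwarders.
    return cost == typemax(UInt16) || cost >= MIN_CACHED_COST
end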

topolarity force-pushed the ct/precompile-inlinable branch from f311c6a to cdb197e on October 14, 2024, 14:57
aviatesk (Member)

I thought it might be better to leverage the program's runtime behavior for this issue, so I tried the following approach:

diff --git a/base/compiler/typeinfer.jl b/base/compiler/typeinfer.jl
index 8b85f7c6f3..216d51426e 100644
--- a/base/compiler/typeinfer.jl
+++ b/base/compiler/typeinfer.jl
@@ -1036,6 +1036,14 @@ end
 _uncompressed_ir(codeinst::CodeInstance, s::String) =
     ccall(:jl_uncompress_ir, Ref{CodeInfo}, (Any, Any, Any), codeinst.def.def::Method, codeinst, s)
 
+function force_compile_inlineable(codeinst::CodeInstance)
+    inferred = @atomic :monotonic codeinst.inferred
+    if inferred isa MaybeCompressed && is_inlineable(inferred)
+        ccall(:jl_force_compile_codeinst, Cvoid, (Any,), codeinst)
+    end
+    nothing
+end
+
 # compute (and cache) an inferred AST and return type
 function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mode::UInt8)
     start_time = ccall(:jl_typeinf_timing_begin, UInt64, ())
@@ -1048,6 +1056,7 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod
                 return code
             end
             if ci_meets_requirement(code, source_mode)
+                force_compile_inlineable(code)
                 ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time)
                 return code
             end
@@ -1075,6 +1084,7 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod
                 return code
             end
             if ci_meets_requirement(code, source_mode)
+                force_compile_inlineable(code)
                 engine_reject(interp, ci)
                 ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time)
                 return code
@@ -1112,6 +1122,7 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod
         finish!(interp, frame; can_discard_trees) # redo finish! with the correct can_discard_trees parameter value
         @assert ci_meets_requirement(ci, source_mode)
     end
+    force_compile_inlineable(ci)
     return ci
 end
 
diff --git a/src/gf.c b/src/gf.c
index fc2e62ebff..29b1d3c36c 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -581,6 +581,10 @@ JL_DLLEXPORT void jl_update_codeinst(
     jl_atomic_store_relaxed(&codeinst->max_world, max_world); // since the edges shouldn't change after jl_fill_codeinst
 }
 
+JL_DLLEXPORT void jl_force_compile_codeinst(jl_code_instance_t *codeinst) {
+    jl_atomic_store_relaxed(&codeinst->precompile, 1);
+}
+
 JL_DLLEXPORT void jl_fill_codeinst(
         jl_code_instance_t *codeinst,
         jl_value_t *rettype, jl_value_t *exctype,
diff --git a/src/precompile_utils.c b/src/precompile_utils.c
index b361bbebc5..183e29d206 100644
--- a/src/precompile_utils.c
+++ b/src/precompile_utils.c
@@ -188,7 +188,7 @@ static int precompile_enq_specialization_(jl_method_instance_t *mi, void *closur
         else if (jl_atomic_load_relaxed(&codeinst->invoke) != jl_fptr_const_return) {
             jl_value_t *inferred = jl_atomic_load_relaxed(&codeinst->inferred);
             if (inferred && inferred != jl_nothing && jl_options.compile_enabled != JL_OPTIONS_COMPILE_ALL &&
-                (jl_options.trim == JL_TRIM_NO || jl_ir_inlining_cost(inferred) == UINT16_MAX)) {
+                ((jl_ir_inlining_cost(inferred) == UINT16_MAX) || jl_atomic_load_relaxed(&codeinst->precompile))) {
                 do_compile = 1;
             }
             else if (jl_atomic_load_relaxed(&codeinst->invoke) != NULL || jl_atomic_load_relaxed(&codeinst->precompile)) {

The basic idea is that even if a code instance is inlineable, it may be more efficient to do_compile it when it is called via dynamic dispatch.

However, this approach is incomplete: if a code instance is first compiled from a type-stable caller context and only later called from a dynamic-dispatch context, we would need a mechanism to re-enqueue that code instance via precompile_enq_specialization_, overriding the cached ->precompile flag on cache retrieval. A rough sketch of that missing piece follows.
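This sketch rests on heavy assumptions: the hook point and the from_dynamic_dispatch argument are hypothetical, and jl_force_compile_codeinst is the runtime helper introduced in the diff above.

# Hypothetical cache-retrieval hook: if an already-cached CodeInstance is
# reached via dynamic dispatch, set its ->precompile flag after the fact so
# that a later precompile_enq_specialization_ pass re-enqueues it.
function mark_on_cache_hit(codeinst::Core.CodeInstance, from_dynamic_dispatch::Bool)
    if from_dynamic_dispatch
        # Same runtime helper as in the diff above; stores 1 into
        # codeinst->precompile.
        ccall(:jl_force_compile_codeinst, Cvoid, (Any,), codeinst)
    end
    return codeinst
end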

KristofferC mentioned this pull request on Oct 18, 2024
Member (review comment on src/precompile_utils.c):

if (inferred && inferred != jl_nothing && jl_options.compile_enabled != JL_OPTIONS_COMPILE_ALL &&
    (jl_options.trim == JL_TRIM_NO || jl_ir_inlining_cost(inferred) == UINT16_MAX)) {
    do_compile = 1;
}
else if (jl_atomic_load_relaxed(&codeinst->invoke) != NULL || jl_atomic_load_relaxed(&codeinst->precompile)) {

Quoting the PR description: "Intuitively, I think users expect function calls that ran during pre-compilation to be compiled and available"

This line should already do exactly that.

vtjnash closed this on Nov 18, 2024
vtjnash deleted the ct/precompile-inlinable branch on November 18, 2024, 22:00