-
Notifications
You must be signed in to change notification settings - Fork 53
/
Copy pathoptim.jl
446 lines (377 loc) · 14.5 KB
/
optim.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
# LLVM IR optimization
"""
    addTargetPasses!(pm, tm, triple)

Register the target-dependent analysis passes — target library information for
`triple` and target transform information for the machine `tm` — on the pass
manager `pm`. These must be added before any optimization passes so that
transforms can query the target's capabilities.
"""
function addTargetPasses!(pm, tm, triple)
    add_library_info!(pm, triple)
    add_transform_info!(pm, tm)
    return
end
# Based on Julia's optimization pipeline, minus the SLP and loop vectorizers.
"""
    addOptimizationPasses!(pm, opt_level=2)

Populate the pass manager `pm` with a middle-end optimization pipeline modeled
on Julia's own, at the given `opt_level`. Levels below 2 get a reduced
pipeline; level 3 additionally enables basic-AA and an extra GVN run.

NOTE: the order of passes below is significant (several comments call out
required orderings); do not reorder without consulting Julia's pipeline.
"""
function addOptimizationPasses!(pm, opt_level=2)
    # compare with using Julia's optimization pipeline directly:
    #ccall(:jl_add_optimization_passes, Cvoid,
    #      (LLVM.API.LLVMPassManagerRef, Cint, Cint),
    #      pm, opt_level, #=lower_intrinsics=# 0)
    #return

    # NOTE: LLVM 12 disabled the hoisting of common instructions
    #       before loop vectorization (https://reviews.llvm.org/D84108).
    #
    #       This is re-enabled with calls to cfg_simplify here,
    #       to merge allocations and sometimes eliminate them,
    #       since AllocOpt does not handle PhiNodes.
    #       Enable this instruction hoisting because of this and Union benchmarks.
    constant_merge!(pm)

    if opt_level < 2
        # reduced pipeline for -O0/-O1: only cheap clean-up passes
        cpu_features!(pm)
        if opt_level == 1
            instruction_simplify!(pm)
        end
        if LLVM.version() >= v"12"
            cfgsimplification!(pm; hoist_common_insts=true)
        else
            cfgsimplification!(pm)
        end
        if opt_level == 1
            scalar_repl_aggregates!(pm)
            instruction_combining!(pm)
            early_cse!(pm)
            # maybe add GVN?
            # also try GVNHoist and GVNSink
        end
        mem_cpy_opt!(pm)
        always_inliner!(pm) # Respect always_inline
        lower_simdloop!(pm) # Annotate loop marked with "loopinfo" as LLVM parallel loop
        return
    end

    # full pipeline (-O2 and up): start with alias analyses and Julia-specific
    # address-space propagation
    propagate_julia_addrsp!(pm)
    scoped_no_alias_aa!(pm)
    type_based_alias_analysis!(pm)
    if opt_level >= 3
        basic_alias_analysis!(pm)
    end
    if LLVM.version() >= v"12"
        cfgsimplification!(pm; hoist_common_insts=true)
    else
        cfgsimplification!(pm)
    end
    dce!(pm)
    scalar_repl_aggregates!(pm)

    #mem_cpy_opt!(pm)

    always_inliner!(pm) # Respect always_inline

    # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard
    # time merging the `alloca` for the unboxed data and the `alloca` created by
    # the `alloc_opt` pass.
    alloc_opt!(pm)
    # consider AggressiveInstCombinePass at optlevel > 2
    instruction_combining!(pm)
    if LLVM.version() >= v"12"
        cfgsimplification!(pm; hoist_common_insts=true)
    else
        cfgsimplification!(pm)
    end
    cpu_features!(pm)
    scalar_repl_aggregates!(pm)
    # SROA can duplicate PHI nodes which can block LowerSIMD
    instruction_combining!(pm)
    jump_threading!(pm)
    correlated_value_propagation!(pm)

    reassociate!(pm)
    early_cse!(pm)

    # Load forwarding above can expose allocations that aren't actually used
    # remove those before optimizing loops.
    alloc_opt!(pm)
    loop_rotate!(pm)
    # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1)
    # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards
    lower_simdloop!(pm) # Annotate loop marked with "loopinfo" as LLVM parallel loop
    licm!(pm)
    julia_licm!(pm)
    if LLVM.version() >= v"15"
        simple_loop_unswitch_legacy!(pm)
    else
        # XXX: simple loop unswitch is available on older versions of LLVM too,
        #      but using this pass instead of the old one breaks Metal.jl.
        loop_unswitch!(pm)
    end
    licm!(pm)
    julia_licm!(pm)
    inductive_range_check_elimination!(pm)
    # Subsequent passes not stripping metadata from terminator
    instruction_simplify!(pm)
    loop_idiom!(pm)
    ind_var_simplify!(pm)
    loop_deletion!(pm)
    loop_unroll!(pm) # TODO: in Julia createSimpleLoopUnroll

    # Run our own SROA on heap objects before LLVM's
    alloc_opt!(pm)
    # Re-run SROA after loop-unrolling (useful for small loops that operate,
    # over the structure of an aggregate)
    scalar_repl_aggregates!(pm)
    # might not be necessary:
    instruction_simplify!(pm)

    gvn!(pm)
    mem_cpy_opt!(pm)
    sccp!(pm)

    # These next two passes must come before IRCE to eliminate the bounds check in #43308
    correlated_value_propagation!(pm)
    dce!(pm)

    inductive_range_check_elimination!(pm) # Must come between the two GVN passes

    # Run instcombine after redundancy elimination to exploit opportunities
    # opened up by them.
    # This needs to be InstCombine instead of InstSimplify to allow
    # loops over Union-typed arrays to vectorize.
    instruction_combining!(pm)
    jump_threading!(pm)
    if opt_level >= 3
        gvn!(pm) # Must come after JumpThreading and before LoopVectorize
    end
    dead_store_elimination!(pm)

    # More dead allocation (store) deletion before loop optimization
    # consider removing this:
    alloc_opt!(pm)

    # see if all of the constant folding has exposed more loops
    # to simplification and deletion
    # this helps significantly with cleaning up iteration
    cfgsimplification!(pm) # See note above, don't hoist instructions before LV
    loop_deletion!(pm)
    instruction_combining!(pm)
    loop_vectorize!(pm)
    loop_load_elimination!(pm)
    # Cleanup after LV pass
    instruction_combining!(pm)
    if LLVM.version() >= v"12"
        cfgsimplification!(pm; # Aggressive CFG simplification
            forward_switch_cond_to_phi=true,
            convert_switch_to_lookup_table=true,
            need_canonical_loop=true,
            hoist_common_insts=true,
            #sink_common_insts=true # FIXME: Causes assertion in llvm-late-lowering
        )
    else
        cfgsimplification!(pm)
    end

    aggressive_dce!(pm)
end
"""
    optimize!(job::CompilerJob, mod::LLVM.Module)

Optimize the LLVM module `mod` for `job`, in several distinct phases:
general optimization, intrinsics lowering (GC frames, kernel state, ptls),
target-specific optimization, and finally interprocedural clean-up.
Mutates `mod` in place and sets the `current_job` global used by the
function/module passes registered here.
"""
function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
    triple = llvm_triple(job.config.target)
    tm = llvm_machine(job.config.target)

    # passes implemented in this file look up the active job through this global
    global current_job
    current_job = job

    # phase 1: general optimization
    @dispose pm=ModulePassManager() begin
        addTargetPasses!(pm, tm, triple)
        addOptimizationPasses!(pm)
        run!(pm, mod)
    end

    # NOTE: we need to use multiple distinct pass managers to force pass ordering;
    #       intrinsics should never get lowered before Julia has optimized them.
    # XXX: why doesn't the barrier noop pass work here?

    # phase 2: lower intrinsics
    @dispose pm=ModulePassManager() begin
        addTargetPasses!(pm, tm, triple)

        if !uses_julia_runtime(job)
            add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!))
        end

        if job.config.kernel
            # GC lowering is the last pass that may introduce calls to the runtime library,
            # and thus additional uses of the kernel state intrinsic.
            # TODO: now that all kernel state-related passes are being run here, merge some?
            add!(pm, ModulePass("AddKernelState", add_kernel_state!))
            add!(pm, FunctionPass("LowerKernelState", lower_kernel_state!))
            add!(pm, ModulePass("CleanupKernelState", cleanup_kernel_state!))
        end

        if !uses_julia_runtime(job)
            # remove dead uses of ptls
            aggressive_dce!(pm)
            add!(pm, ModulePass("LowerPTLS", lower_ptls!))
        end

        if uses_julia_runtime(job)
            lower_exc_handlers!(pm)
        end
        # the Julia GC lowering pass also has some clean-up that is required
        late_lower_gc_frame!(pm)
        if uses_julia_runtime(job)
            final_lower_gc!(pm)
        end

        remove_ni!(pm)
        remove_julia_addrspaces!(pm)

        if uses_julia_runtime(job)
            # We need these two passes and the instcombine below
            # after GC lowering to let LLVM do some constant propagation on the tags.
            # and remove some unnecessary write barrier checks.
            gvn!(pm)
            sccp!(pm)
            # Remove dead use of ptls
            dce!(pm)
            LLVM.Interop.lower_ptls!(pm, dump_native(job))
            instruction_combining!(pm)
            # Clean up write barrier and ptls lowering
            cfgsimplification!(pm)
        end

        # Julia's operand bundles confuse the inliner, so repeat here now they are gone.
        # FIXME: we should fix the inliner so that inlined code gets optimized early-on
        always_inliner!(pm)

        # some of Julia's optimization passes happen _after_ lowering intrinsics
        combine_mul_add!(pm)
        div_rem_pairs!(pm)

        if VERSION < v"1.10.0-DEV.1144"
            # save function attributes to work around JuliaGPU/GPUCompiler#437
            current_attrs = Dict{String,Any}()
            for f in functions(mod)
                attrs = function_attributes(f)
                length(attrs) == 0 && continue
                current_attrs[LLVM.name(f)] = collect(attrs)
            end
        end

        run!(pm, mod)

        if VERSION < v"1.10.0-DEV.1144"
            # restore function attributes
            for (fn, attrs) in current_attrs
                # skip functions that the pass pipeline removed entirely
                haskey(functions(mod), fn) || continue
                f = functions(mod)[fn]
                for attr in attrs
                    # NOTE: there's no function attributes that contain a type,
                    #       so we can just blindly add them back
                    push!(function_attributes(f), attr)
                end
            end
        end
    end

    # phase 3: target-specific optimizations
    optimize_module!(job, mod)

    # we compile a module containing the entire call graph,
    # so perform some interprocedural optimizations.
    #
    # for some reason, these passes need to be distinct from the regular optimization chain,
    # or certain values (such as the constant arrays used to populate llvm.compiler.used as
    # part of the LateLowerGCFrame pass) aren't collected properly.
    #
    # these might not always be safe, as Julia's IR metadata isn't designed for IPO.
    @dispose pm=ModulePassManager() begin
        addTargetPasses!(pm, tm, triple)

        # simplify function calls that don't use the returned value
        dead_arg_elimination!(pm)

        run!(pm, mod)
    end

    # compare to Clang by using the pass manager builder APIs:
    #LLVM.clopts("-print-after-all", "-filter-print-funcs=$(LLVM.name(entry))")
    #@dispose pm=ModulePassManager() begin
    #    addTargetPasses!(pm, tm, triple)
    #    PassManager@dispose pmb=IRBuilder() begin
    #        optlevel!(pmb, 2)
    #        populate!(pm, pmb)
    #    end
    #    run!(pm, mod)
    #end

    return
end
## lowering intrinsics
# Register the `LowerCPUFeatures` module pass on `pm`; the lowering itself is
# implemented by the `cpu_features!(mod::LLVM.Module)` method.
function cpu_features!(pm::PassManager)
    return add!(pm, ModulePass("LowerCPUFeatures", cpu_features!))
end
"""
    cpu_features!(mod::LLVM.Module)

Lower `julia.cpu.have_fma.<typ>` intrinsics in `mod` to constants that report
whether the current job's back-end supports fused multiply-add for that
floating-point type, then delete the intrinsic and its call sites.
Returns whether the module was modified.
"""
function cpu_features!(mod::LLVM.Module)
    job = current_job::CompilerJob
    changed = false

    # map the intrinsic's type suffix to the corresponding Julia type
    argtyps = Dict(
        "f32" => Float32,
        "f64" => Float64,
    )

    # have_fma
    for f in functions(mod)
        ft = function_type(f)
        fn = LLVM.name(f)
        startswith(fn, "julia.cpu.have_fma.") || continue
        typnam = fn[20:end]  # suffix after the 19-char "julia.cpu.have_fma." prefix

        # determine whether this back-end supports FMA on this type
        has_fma = if haskey(argtyps, typnam)
            typ = argtyps[typnam]
            have_fma(job.config.target, typ)
        else
            # unknown type suffix; conservatively report no FMA support
            # (TODO: warn?)
            false
        end
        has_fma = ConstantInt(return_type(ft), has_fma)

        # substitute all uses of the intrinsic with a constant
        materialized = LLVM.Value[]
        for use in uses(f)
            val = user(use)
            replace_uses!(val, has_fma)
            push!(materialized, val)
            changed = true
        end

        # remove the (now dead) call sites and the intrinsic itself
        for val in materialized
            @assert isempty(uses(val))
            unsafe_delete!(LLVM.parent(val), val)
        end
        @assert isempty(uses(f))
        unsafe_delete!(mod, f)
        changed = true
    end

    return changed
end
# lower object allocations to PTX malloc
#
# this is a PoC implementation that is very simple: allocate, and never free. it also runs
# _before_ Julia's GC lowering passes, so we don't get to use the results of its analyses.
# when we ever implement a more potent GC, we will need those results, but the relevant pass
# is currently very architecture/CPU specific: hard-coded pool sizes, TLS references, etc.
# such IR is hard to clean-up, so we probably will need to have the GC lowering pass emit
# lower-level intrinsics which then can be lowered to architecture-specific code.
function lower_gc_frame!(fun::LLVM.Function)
    job = current_job::CompilerJob
    mod = LLVM.parent(fun)
    ctx = context(fun)
    changed = false

    # plain alloc: replace `julia.gc_alloc_obj` calls with runtime pool allocations
    if haskey(functions(mod), "julia.gc_alloc_obj")
        alloc_obj = functions(mod)["julia.gc_alloc_obj"]
        alloc_obj_ft = function_type(alloc_obj)
        T_prjlvalue = return_type(alloc_obj_ft)
        T_pjlvalue = convert(LLVMType, Any; ctx, allow_boxed=true)

        for use in uses(alloc_obj)
            call = user(use)::LLVM.CallInst

            # decode the call
            ops = arguments(call)
            sz = ops[2]  # allocation size operand

            # replace with PTX alloc_obj
            @dispose builder=IRBuilder(ctx) begin
                position!(builder, call)
                ptr = call!(builder, Runtime.get(:gc_pool_alloc), [sz])
                replace_uses!(call, ptr)
            end

            unsafe_delete!(LLVM.parent(call), call)

            changed = true
        end

        @compiler_assert isempty(uses(alloc_obj)) job
    end

    # we don't care about write barriers: there is no generational GC to notify
    if haskey(functions(mod), "julia.write_barrier")
        barrier = functions(mod)["julia.write_barrier"]

        for use in uses(barrier)
            call = user(use)::LLVM.CallInst
            unsafe_delete!(LLVM.parent(call), call)
            changed = true
        end

        @compiler_assert isempty(uses(barrier)) job
    end

    return changed
end
# lower the TLS-lookup intrinsic (`julia.get_pgcstack`, or `julia.ptls_states` on
# older Julia versions) by deleting its dead call sites, since it is GPU incompatible.
#
# this assumes and checks that the TLS is unused, which should be the case for most GPU
# code after lowering the GC intrinsics to TLS-less code and having run DCE. calls whose
# results are still used are deliberately left in place for the IR validator to report.
#
# TODO: maybe don't have Julia emit actual uses of the TLS, but use intrinsics instead,
#       making it easier to remove or reimplement that functionality here.
function lower_ptls!(mod::LLVM.Module)
    job = current_job::CompilerJob
    changed = false

    intrinsic = VERSION >= v"1.7.0-DEV.1205" ? "julia.get_pgcstack" : "julia.ptls_states"

    if haskey(functions(mod), intrinsic)
        getter = functions(mod)[intrinsic]
        for use in uses(getter)
            inst = user(use)
            # only dead calls can be dropped; live ones are caught by the validator
            isempty(uses(inst)) || continue
            unsafe_delete!(LLVM.parent(inst), inst)
            changed = true
        end
    end

    return changed
end