-
Notifications
You must be signed in to change notification settings - Fork 67
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
@avx doesn't handle @inbounds
#88
Comments
Note that
julia> a = rand(15);
julia> using LoopVectorization, BenchmarkTools
julia> foo(a) = @avx for i in 1:10
a[i] = 1
end
foo (generic function with 1 method)
julia> @benchmark foo($a)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 1.878 ns (0.00% GC)
median time: 1.896 ns (0.00% GC)
mean time: 1.899 ns (0.00% GC)
maximum time: 4.488 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 1000
julia> a'
1×15 LinearAlgebra.Adjoint{Float64,Array{Float64,1}}:
1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.481493 0.660123 0.615034 0.245557 0.0906999
# julia> @code_llvm debuginfo=:none foo(a)
define nonnull %jl_value_t addrspace(10)* @japi1_foo_2557(%jl_value_t addrspace(10)*, %jl_value_t addrspace(10)**, i32) #0 {
top:
%3 = alloca %jl_value_t addrspace(10)**, align 8
store volatile %jl_value_t addrspace(10)** %1, %jl_value_t addrspace(10)*** %3, align 8
%4 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %1, align 8
%5 = addrspacecast %jl_value_t addrspace(10)* %4 to %jl_value_t addrspace(11)*
%6 = addrspacecast %jl_value_t addrspace(11)* %5 to %jl_value_t*
%7 = bitcast %jl_value_t* %6 to i64*
%8 = load i64, i64* %7, align 8
%ptr.i2 = inttoptr i64 %8 to <8 x double>*
store <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, <8 x double>* %ptr.i2, align 8
%typptr.i = inttoptr i64 %8 to double*
%offsetptr.i = getelementptr inbounds double, double* %typptr.i, i64 8
%ptr.i = bitcast double* %offsetptr.i to <8 x double>*
call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 1.000000e+00, double 1.000000e+00, double undef, double undef, double undef, double undef, double undef, double undef>, <8 x double>* %ptr.i, i32 8, <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
ret %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140026789409680 to %jl_value_t*) to %jl_value_t addrspace(10)*)
}
; julia> @code_native debuginfo=:none foo(a)
.text
movq %rsi, -8(%rsp)
movq (%rsi), %rax
movq (%rax), %rax
movabsq $.rodata.cst8, %rcx
vbroadcastsd (%rcx), %zmm0
vmovupd %zmm0, (%rax)
movb $3, %cl
kmovd %ecx, %k1
vmovupd %zmm0, 64(%rax) {%k1}
movabsq $jl_system_image_data, %rax
vzeroupper
retq
So the fix here was to ignore the
julia> fooinbounds(a) = @avx for i in 1:10
@inbounds a[i] = 1
end
fooinbounds (generic function with 1 method)
julia> fooinbounds(a); a'
1×15 LinearAlgebra.Adjoint{Float64,Array{Float64,1}}:
1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.481493 0.660123 0.615034 0.245557 0.0906999
# julia> @code_llvm debuginfo=:none fooinbounds(a)
define nonnull %jl_value_t addrspace(10)* @japi1_fooinbounds_2676(%jl_value_t addrspace(10)*, %jl_value_t addrspace(10)**, i32) #0 {
top:
%3 = alloca %jl_value_t addrspace(10)**, align 8
store volatile %jl_value_t addrspace(10)** %1, %jl_value_t addrspace(10)*** %3, align 8
%4 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %1, align 8
%5 = addrspacecast %jl_value_t addrspace(10)* %4 to %jl_value_t addrspace(11)*
%6 = addrspacecast %jl_value_t addrspace(11)* %5 to %jl_value_t*
%7 = bitcast %jl_value_t* %6 to i64*
%8 = load i64, i64* %7, align 8
%ptr.i2 = inttoptr i64 %8 to <8 x double>*
store <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, <8 x double>* %ptr.i2, align 8
%typptr.i = inttoptr i64 %8 to double*
%offsetptr.i = getelementptr inbounds double, double* %typptr.i, i64 8
%ptr.i = bitcast double* %offsetptr.i to <8 x double>*
call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 1.000000e+00, double 1.000000e+00, double undef, double undef, double undef, double undef, double undef, double undef>, <8 x double>* %ptr.i, i32 8, <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
ret %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140026789409680 to %jl_value_t*) to %jl_value_t addrspace(10)*)
}
; julia> @code_native debuginfo=:none fooinbounds(a)
.text
movq %rsi, -8(%rsp)
movq (%rsi), %rax
movq (%rax), %rax
movabsq $.rodata.cst8, %rcx
vbroadcastsd (%rcx), %zmm0
vmovupd %zmm0, (%rax)
movb $3, %cl
kmovd %ecx, %k1
vmovupd %zmm0, 64(%rax) {%k1}
movabsq $jl_system_image_data, %rax
vzeroupper
retq
I wonder if it'd be better if the second store were a |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
MWE:
The text was updated successfully, but these errors were encountered: