-
Notifications
You must be signed in to change notification settings - Fork 67
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Not all of fastmath is supported? #86
Comments
LoopVectorization assumes that it's allowed to reorder your operations, so FastMath is the default.
julia> using LoopVectorization
julia> function div33!(b, a)
@avx for i = 1:100
b[i] = a[i] / 33
end
end
div33! (generic function with 1 method)
julia> a = rand(100); b = similar(a);
julia> div33!(b, a); b ≈ (a ./ 33)
true
# julia> @code_llvm debuginfo=:none div33!(b, a)
define nonnull %jl_value_t addrspace(10)* @"japi1_div33!_2500"(%jl_value_t addrspace(10)*, %jl_value_t addrspace(10)**, i32) #0 {
L40:
%3 = alloca %jl_value_t addrspace(10)**, align 8
store volatile %jl_value_t addrspace(10)** %1, %jl_value_t addrspace(10)*** %3, align 8
%4 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %1, align 8
%5 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %1, i64 1
%6 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %5, align 8
%7 = addrspacecast %jl_value_t addrspace(10)* %4 to %jl_value_t addrspace(11)*
%8 = addrspacecast %jl_value_t addrspace(11)* %7 to %jl_value_t*
%9 = bitcast %jl_value_t* %8 to double**
%10 = load double*, double** %9, align 8
%11 = addrspacecast %jl_value_t addrspace(10)* %6 to %jl_value_t addrspace(11)*
%12 = addrspacecast %jl_value_t addrspace(11)* %11 to %jl_value_t*
%13 = bitcast %jl_value_t* %12 to double**
%14 = load double*, double** %13, align 8
%ptr.i = bitcast double* %14 to <8 x double>*
%res.i = load <8 x double>, <8 x double>* %ptr.i, align 8
%res.i18 = fmul fast <8 x double> %res.i, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%ptr.i17 = bitcast double* %10 to <8 x double>*
store <8 x double> %res.i18, <8 x double>* %ptr.i17, align 8
%offsetptr.i.1 = getelementptr inbounds double, double* %14, i64 8
%ptr.i.1 = bitcast double* %offsetptr.i.1 to <8 x double>*
%res.i.1 = load <8 x double>, <8 x double>* %ptr.i.1, align 8
%res.i18.1 = fmul fast <8 x double> %res.i.1, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%offsetptr.i16.1 = getelementptr inbounds double, double* %10, i64 8
%ptr.i17.1 = bitcast double* %offsetptr.i16.1 to <8 x double>*
store <8 x double> %res.i18.1, <8 x double>* %ptr.i17.1, align 8
%offsetptr.i.2 = getelementptr inbounds double, double* %14, i64 16
%ptr.i.2 = bitcast double* %offsetptr.i.2 to <8 x double>*
%res.i.2 = load <8 x double>, <8 x double>* %ptr.i.2, align 8
%res.i18.2 = fmul fast <8 x double> %res.i.2, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%offsetptr.i16.2 = getelementptr inbounds double, double* %10, i64 16
%ptr.i17.2 = bitcast double* %offsetptr.i16.2 to <8 x double>*
store <8 x double> %res.i18.2, <8 x double>* %ptr.i17.2, align 8
%offsetptr.i.3 = getelementptr inbounds double, double* %14, i64 24
%ptr.i.3 = bitcast double* %offsetptr.i.3 to <8 x double>*
%res.i.3 = load <8 x double>, <8 x double>* %ptr.i.3, align 8
%res.i18.3 = fmul fast <8 x double> %res.i.3, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%offsetptr.i16.3 = getelementptr inbounds double, double* %10, i64 24
%ptr.i17.3 = bitcast double* %offsetptr.i16.3 to <8 x double>*
store <8 x double> %res.i18.3, <8 x double>* %ptr.i17.3, align 8
%offsetptr.i.4 = getelementptr inbounds double, double* %14, i64 32
%ptr.i.4 = bitcast double* %offsetptr.i.4 to <8 x double>*
%res.i.4 = load <8 x double>, <8 x double>* %ptr.i.4, align 8
%res.i18.4 = fmul fast <8 x double> %res.i.4, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%offsetptr.i16.4 = getelementptr inbounds double, double* %10, i64 32
%ptr.i17.4 = bitcast double* %offsetptr.i16.4 to <8 x double>*
store <8 x double> %res.i18.4, <8 x double>* %ptr.i17.4, align 8
%offsetptr.i.5 = getelementptr inbounds double, double* %14, i64 40
%ptr.i.5 = bitcast double* %offsetptr.i.5 to <8 x double>*
%res.i.5 = load <8 x double>, <8 x double>* %ptr.i.5, align 8
%res.i18.5 = fmul fast <8 x double> %res.i.5, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%offsetptr.i16.5 = getelementptr inbounds double, double* %10, i64 40
%ptr.i17.5 = bitcast double* %offsetptr.i16.5 to <8 x double>*
store <8 x double> %res.i18.5, <8 x double>* %ptr.i17.5, align 8
%offsetptr.i.6 = getelementptr inbounds double, double* %14, i64 48
%ptr.i.6 = bitcast double* %offsetptr.i.6 to <8 x double>*
%res.i.6 = load <8 x double>, <8 x double>* %ptr.i.6, align 8
%res.i18.6 = fmul fast <8 x double> %res.i.6, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%offsetptr.i16.6 = getelementptr inbounds double, double* %10, i64 48
%ptr.i17.6 = bitcast double* %offsetptr.i16.6 to <8 x double>*
store <8 x double> %res.i18.6, <8 x double>* %ptr.i17.6, align 8
%offsetptr.i.7 = getelementptr inbounds double, double* %14, i64 56
%ptr.i.7 = bitcast double* %offsetptr.i.7 to <8 x double>*
%res.i.7 = load <8 x double>, <8 x double>* %ptr.i.7, align 8
%res.i18.7 = fmul fast <8 x double> %res.i.7, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%offsetptr.i16.7 = getelementptr inbounds double, double* %10, i64 56
%ptr.i17.7 = bitcast double* %offsetptr.i16.7 to <8 x double>*
store <8 x double> %res.i18.7, <8 x double>* %ptr.i17.7, align 8
%offsetptr.i.8 = getelementptr inbounds double, double* %14, i64 64
%ptr.i.8 = bitcast double* %offsetptr.i.8 to <8 x double>*
%res.i.8 = load <8 x double>, <8 x double>* %ptr.i.8, align 8
%res.i18.8 = fmul fast <8 x double> %res.i.8, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%offsetptr.i16.8 = getelementptr inbounds double, double* %10, i64 64
%ptr.i17.8 = bitcast double* %offsetptr.i16.8 to <8 x double>*
store <8 x double> %res.i18.8, <8 x double>* %ptr.i17.8, align 8
%offsetptr.i.9 = getelementptr inbounds double, double* %14, i64 72
%ptr.i.9 = bitcast double* %offsetptr.i.9 to <8 x double>*
%res.i.9 = load <8 x double>, <8 x double>* %ptr.i.9, align 8
%res.i18.9 = fmul fast <8 x double> %res.i.9, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%offsetptr.i16.9 = getelementptr inbounds double, double* %10, i64 72
%ptr.i17.9 = bitcast double* %offsetptr.i16.9 to <8 x double>*
store <8 x double> %res.i18.9, <8 x double>* %ptr.i17.9, align 8
%offsetptr.i.10 = getelementptr inbounds double, double* %14, i64 80
%ptr.i.10 = bitcast double* %offsetptr.i.10 to <8 x double>*
%res.i.10 = load <8 x double>, <8 x double>* %ptr.i.10, align 8
%res.i18.10 = fmul fast <8 x double> %res.i.10, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%offsetptr.i16.10 = getelementptr inbounds double, double* %10, i64 80
%ptr.i17.10 = bitcast double* %offsetptr.i16.10 to <8 x double>*
store <8 x double> %res.i18.10, <8 x double>* %ptr.i17.10, align 8
%offsetptr.i.11 = getelementptr inbounds double, double* %14, i64 88
%ptr.i.11 = bitcast double* %offsetptr.i.11 to <8 x double>*
%res.i.11 = load <8 x double>, <8 x double>* %ptr.i.11, align 8
%res.i18.11 = fmul fast <8 x double> %res.i.11, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08>
%offsetptr.i16.11 = getelementptr inbounds double, double* %10, i64 88
%ptr.i17.11 = bitcast double* %offsetptr.i16.11 to <8 x double>*
store <8 x double> %res.i18.11, <8 x double>* %ptr.i17.11, align 8
%offsetptr.i12 = getelementptr inbounds double, double* %14, i64 96
%ptr.i13 = bitcast double* %offsetptr.i12 to <8 x double>*
%res.i14 = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull %ptr.i13, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x double> undef)
%res.i10 = fmul fast <8 x double> %res.i14, <double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double 0x3F9F07C1F07C1F08, double undef, double undef, double undef, double undef>
%offsetptr.i8 = getelementptr inbounds double, double* %10, i64 96
%ptr.i9 = bitcast double* %offsetptr.i8 to <8 x double>*
call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %res.i10, <8 x double>* %ptr.i9, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>)
ret %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 139821213853376 to %jl_value_t*) to %jl_value_t addrspace(10)*)
}
Notice that it uses `fmul fast` by a constant instead of a division:
julia> reinterpret(Float64, 0x3F9F07C1F07C1F08)
0.030303030303030304
julia> 1 / 33
0.030303030303030304
If you want to avoid FastMath, you can use `evfdiv`:
function ediv33!(x, y)
@avx for i = 1:100
x[i] = evfdiv(y[i], 33)
end
end
Ahhh, interesting. I had missed that fact and when writing some experimental code, I had remembered seeing a fastmath macro. But what's confusing is that using |
It seems that not all of the fast math primitives are supported? Besides div_fast, I also noticed that conj_fast doesn't work either, but that may not be important until LoopVectorization supports complex arithmetic.
The text was updated successfully, but these errors were encountered: