Skip to content

Commit

Permalink
Don't aggregate homogeneous floats in the Rust ABI
Browse files Browse the repository at this point in the history
  • Loading branch information
Urgau committed Feb 21, 2022
1 parent 2e374cf commit 7b69d21
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 2 deletions.
22 changes: 20 additions & 2 deletions compiler/rustc_middle/src/ty/layout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ use rustc_session::{config::OptLevel, DataTypeKind, FieldInfo, SizeKind, Variant
use rustc_span::symbol::Symbol;
use rustc_span::{Span, DUMMY_SP};
use rustc_target::abi::call::{
ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, Reg, RegKind,
ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, HomogeneousAggregate, PassMode,
Reg, RegKind,
};
use rustc_target::abi::*;
use rustc_target::spec::{abi::Abi as SpecAbi, HasTargetSpec, PanicStrategy, Target};
Expand Down Expand Up @@ -3203,10 +3204,27 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {

if arg.layout.is_unsized() || size > max_by_val_size {
arg.make_indirect();
} else if let Ok(HomogeneousAggregate::Homogeneous(Reg {
kind: RegKind::Float,
..
})) = arg.layout.homogeneous_aggregate(self)
{
// We don't want to cast a homogeneous aggregate of floats to an Integer
// register, because this hurts the generated assembly (#93490).
//
// As an optimization, we want to pass homogeneous aggregates of floats
// that are larger than a pointer indirectly.
if size > Pointer.size(self) {
arg.make_indirect();
}
} else {
// We want to pass small aggregates as immediates, but using
// a LLVM aggregate type for this leads to bad optimizations,
// so we pick an appropriately sized integer type instead.
//
// NOTE: This is sub-optimal because in the case of (f32, f32, u32, u32)
// we could do ([f32; 2], u64) which is better but this is the best we
// can do right now.
arg.cast_to(Reg { kind: RegKind::Integer, size });
}
}
Expand Down Expand Up @@ -3237,7 +3255,7 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
arg.make_indirect();
}

_ => {},
_ => {}
}
};
fixup(&mut fn_abi.ret);
Expand Down
45 changes: 45 additions & 0 deletions src/test/assembly/x86-64-homogenous-floats.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// assembly-output: emit-asm
// needs-llvm-components: x86
// compile-flags: --target x86_64-unknown-linux-gnu
// compile-flags: -C llvm-args=--x86-asm-syntax=intel
// compile-flags: -C opt-level=3

#![crate_type = "rlib"]
#![no_std]

// CHECK-LABEL: sum_f32:
// CHECK: addss xmm0, xmm1
// CHECK-NEXT: ret
// Adds two `f32` scalars. The FileCheck directives directly above this
// function expect the body to compile down to one `addss xmm0, xmm1`
// followed immediately by `ret`, so the expression is kept as simple as possible.
#[no_mangle]
pub fn sum_f32(a: f32, b: f32) -> f32 {
a + b
}

// CHECK-LABEL: sum_f32x2:
// CHECK: addss xmm{{[0-9]}}, xmm{{[0-9]}}
// CHECK-NEXT: addss xmm{{[0-9]}}, xmm{{[0-9]}}
// CHECK-NEXT: ret
// Element-wise sum of two `[f32; 2]` arrays. The FileCheck directives directly
// above this function expect exactly two scalar `addss` instructions on xmm
// registers followed by `ret`.
#[no_mangle]
pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
[
a[0] + b[0],
a[1] + b[1],
]
}

// CHECK-LABEL: sum_f32x4:
// CHECK: mov rax, [[PTR_IN:.*]]
// CHECK-NEXT: movups [[XMMA:xmm[0-9]]], xmmword ptr [rsi]
// CHECK-NEXT: movups [[XMMB:xmm[0-9]]], xmmword ptr [rdx]
// CHECK-NEXT: addps [[XMMB]], [[XMMA]]
// CHECK-NEXT: movups xmmword ptr [[[PTR_IN]]], [[XMMB]]
// CHECK-NEXT: ret
// Element-wise sum of two `[f32; 4]` arrays. The FileCheck directives directly
// above this function expect both inputs to be loaded from memory (`[rsi]` and
// `[rdx]`), added with a single packed `addps`, and the result stored through
// the pointer that is also moved into `rax`, followed by `ret`.
#[no_mangle]
pub fn sum_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
[
a[0] + b[0],
a[1] + b[1],
a[2] + b[2],
a[3] + b[3],
]
}
32 changes: 32 additions & 0 deletions src/test/codegen/homogeneous-floats.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//! Check that small (less than 128 bits on x86_64) homogeneous floats are either passed as an
//! array or by pointer
// compile-flags: -C no-prepopulate-passes -O
// only-x86_64

#![crate_type = "lib"]

// Struct made of four `f32` fields and nothing else. It is used as the
// argument and return type of `array_f32x4_nested` below.
pub struct Foo {
bar1: f32,
bar2: f32,
bar3: f32,
bar4: f32,
}

// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)
// Signature-only probe with a placeholder body (`todo!()`). The directive
// directly above expects both arguments and the return value to appear as
// `[2 x float]` values in the LLVM function definition.
#[no_mangle]
pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
todo!()
}

// CHECK: define void @array_f32x4([4 x float]* {{.*}} sret([4 x float]) {{.*}} %0, [4 x float]* {{.*}} %a, [4 x float]* {{.*}} %b)
// Signature-only probe with a placeholder body (`todo!()`). The directive
// directly above expects a `void` LLVM function whose first parameter is an
// `sret([4 x float])` pointer and whose arguments `%a` and `%b` are
// `[4 x float]*` pointers.
#[no_mangle]
pub fn array_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
todo!()
}

// CHECK: define void @array_f32x4_nested(%Foo* {{.*}} sret(%Foo) {{.*}} %0, %Foo* {{.*}} %a, %Foo* {{.*}} %b)
// Same probe as `array_f32x4`, but with the four floats wrapped in the `Foo`
// struct. The directive directly above expects a `void` LLVM function with an
// `sret(%Foo)` pointer and `%Foo*` pointers for `%a` and `%b`. The body is a
// placeholder (`todo!()`).
#[no_mangle]
pub fn array_f32x4_nested(a: Foo, b: Foo) -> Foo {
todo!()
}
46 changes: 46 additions & 0 deletions src/test/ui/abi/homogenous-floats.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// This tests that, no matter the optimization level or the target features enabled, the
// non-aggregation of homogeneous floats in the ABI is sound and still produces the right answer.

// revisions: opt-0 opt-0-native opt-1 opt-1-native opt-2 opt-2-native opt-3 opt-3-native
// [opt-0]: compile-flags: -C opt-level=0
// [opt-1]: compile-flags: -C opt-level=1
// [opt-2]: compile-flags: -C opt-level=2
// [opt-3]: compile-flags: -C opt-level=3
// [opt-0-native]: compile-flags: -C target-cpu=native
// [opt-1-native]: compile-flags: -C target-cpu=native
// [opt-2-native]: compile-flags: -C target-cpu=native
// [opt-3-native]: compile-flags: -C target-cpu=native
// run-pass

#![feature(core_intrinsics)]

use std::intrinsics::black_box;

/// Returns the sum of the two `f32` scalars `a` and `b`.
pub fn sum_f32(a: f32, b: f32) -> f32 {
    std::ops::Add::add(a, b)
}

/// Returns the element-wise sum of two `[f32; 2]` arrays.
pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
    let [a0, a1] = a;
    let [b0, b1] = b;
    [a0 + b0, a1 + b1]
}

/// Returns the element-wise sum of two `[f32; 3]` arrays.
pub fn sum_f32x3(a: [f32; 3], b: [f32; 3]) -> [f32; 3] {
    // Start from a copy of `a` and add the matching lane of `b` to each slot.
    let mut acc = a;
    for (lane, rhs) in acc.iter_mut().zip(b.iter()) {
        *lane += *rhs;
    }
    acc
}

/// Returns the element-wise sum of two `[f32; 4]` arrays.
pub fn sum_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    // Every slot of `out` is overwritten below; the initial zeros never leak out.
    let mut out = [0.0_f32; 4];
    for ((dst, lhs), rhs) in out.iter_mut().zip(a.iter()).zip(b.iter()) {
        *dst = lhs + rhs;
    }
    out
}

/// Calls each `sum_*` helper once and asserts on the result. Every operand and
/// every result is routed through `black_box` so the optimizer cannot fold the
/// calls away.
fn main() {
    let scalar = sum_f32(black_box(0.), black_box(1.));
    assert_eq!(1., black_box(scalar));

    let pair = sum_f32x2(black_box([2., 0.]), black_box([0., 2.]));
    assert_eq!([2., 2.], black_box(pair));

    let triple = sum_f32x3(black_box([1., 2., 3.]), black_box([2., 1., 0.]));
    assert_eq!([3., 3., 3.], black_box(triple));

    let quad = sum_f32x4(black_box([1., 2., 3., 4.]), black_box([3., 2., 1., 0.]));
    assert_eq!([4., 4., 4., 4.], black_box(quad));
}

0 comments on commit 7b69d21

Please sign in to comment.