diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index d8e56e949dbcb..ea2da43ba1876 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -13,7 +13,8 @@ use rustc_session::{config::OptLevel, DataTypeKind, FieldInfo, SizeKind, Variant
 use rustc_span::symbol::Symbol;
 use rustc_span::{Span, DUMMY_SP};
 use rustc_target::abi::call::{
-    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, Reg, RegKind,
+    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, HomogeneousAggregate, PassMode,
+    Reg, RegKind,
 };
 use rustc_target::abi::*;
 use rustc_target::spec::{abi::Abi as SpecAbi, HasTargetSpec, PanicStrategy, Target};
@@ -3203,10 +3204,27 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
 
             if arg.layout.is_unsized() || size > max_by_val_size {
                 arg.make_indirect();
+            } else if let Ok(HomogeneousAggregate::Homogeneous(Reg {
+                kind: RegKind::Float,
+                ..
+            })) = arg.layout.homogeneous_aggregate(self)
+            {
+                // We don't want to pass float aggregates as aggregates of
+                // integers, because that hurts the generated assembly (#93490).
+                //
+                // As an optimization, pass homogeneous float aggregates larger
+                // than a pointer indirectly.
+                if size > Pointer.size(self) {
+                    arg.make_indirect();
+                }
             } else {
                 // We want to pass small aggregates as immediates, but using
                 // a LLVM aggregate type for this leads to bad optimizations,
                 // so we pick an appropriately sized integer type instead.
+                //
+                // NOTE: This is sub-optimal: for (f32, f32, u32, u32) we could
+                // use ([f32; 2], u64), which is better, but this is the best
+                // we can do right now.
                 arg.cast_to(Reg { kind: RegKind::Integer, size });
             }
         }
@@ -3237,7 +3255,7 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                     arg.make_indirect();
                 }
 
-                _ => {},
+                _ => {}
             }
         };
         fixup(&mut fn_abi.ret);
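For readers who don't work on rustc, the branch added above makes a three-way choice. The sketch below models that choice as a standalone program; `classify`, `POINTER_SIZE`, and the stripped-down `PassMode`/`RegKind` enums are hypothetical stand-ins for the real `rustc_target` types, and the `is_unsized` check is omitted:

```rust
/// Simplified stand-ins for rustc_target's types; names are illustrative.
#[derive(Debug, PartialEq)]
#[allow(dead_code)]
enum RegKind {
    Integer,
    Float,
}

#[derive(Debug, PartialEq)]
enum PassMode {
    /// Left alone, e.g. `[f32; 2]` stays a `[2 x float]` in LLVM IR.
    Direct,
    /// Cast to a register of this kind and byte size.
    Cast(RegKind, u64),
    /// Passed behind a pointer.
    Indirect,
}

const POINTER_SIZE: u64 = 8; // bytes on x86_64

/// Mirrors the new fixup: large values go indirect; homogeneous float
/// aggregates stay direct when they fit in a pointer and go indirect
/// otherwise; everything else is cast to a same-sized integer.
fn classify(size: u64, max_by_val_size: u64, homogeneous_float: bool) -> PassMode {
    if size > max_by_val_size {
        PassMode::Indirect
    } else if homogeneous_float {
        if size > POINTER_SIZE { PassMode::Indirect } else { PassMode::Direct }
    } else {
        PassMode::Cast(RegKind::Integer, size)
    }
}

fn main() {
    // [f32; 2]: 8 bytes of floats, fits in a pointer -> stays direct.
    assert_eq!(classify(8, 16, true), PassMode::Direct);
    // [f32; 4]: 16 bytes of floats, larger than a pointer -> indirect.
    assert_eq!(classify(16, 16, true), PassMode::Indirect);
    // (u32, u32): mixed aggregate -> still cast to an 8-byte integer.
    assert_eq!(classify(8, 16, false), PassMode::Cast(RegKind::Integer, 8));
}
```

The tests below pin down each of these three outcomes against the actual compiler output.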
diff --git a/src/test/assembly/x86-64-homogenous-floats.rs b/src/test/assembly/x86-64-homogenous-floats.rs
new file mode 100644
index 0000000000000..b2ab2aeba2936
--- /dev/null
+++ b/src/test/assembly/x86-64-homogenous-floats.rs
@@ -0,0 +1,45 @@
+// assembly-output: emit-asm
+// needs-llvm-components: x86
+// compile-flags: --target x86_64-unknown-linux-gnu
+// compile-flags: -C llvm-args=--x86-asm-syntax=intel
+// compile-flags: -C opt-level=3
+
+#![crate_type = "rlib"]
+#![no_std]
+
+// CHECK-LABEL: sum_f32:
+// CHECK: addss xmm0, xmm1
+// CHECK-NEXT: ret
+#[no_mangle]
+pub fn sum_f32(a: f32, b: f32) -> f32 {
+    a + b
+}
+
+// CHECK-LABEL: sum_f32x2:
+// CHECK: addss xmm{{[0-9]}}, xmm{{[0-9]}}
+// CHECK-NEXT: addss xmm{{[0-9]}}, xmm{{[0-9]}}
+// CHECK-NEXT: ret
+#[no_mangle]
+pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+    [
+        a[0] + b[0],
+        a[1] + b[1],
+    ]
+}
+
+// CHECK-LABEL: sum_f32x4:
+// CHECK: mov rax, [[PTR_IN:.*]]
+// CHECK-NEXT: movups [[XMMA:xmm[0-9]]], xmmword ptr [rsi]
+// CHECK-NEXT: movups [[XMMB:xmm[0-9]]], xmmword ptr [rdx]
+// CHECK-NEXT: addps [[XMMB]], [[XMMA]]
+// CHECK-NEXT: movups xmmword ptr [[[PTR_IN]]], [[XMMB]]
+// CHECK-NEXT: ret
+#[no_mangle]
+pub fn sum_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    [
+        a[0] + b[0],
+        a[1] + b[1],
+        a[2] + b[2],
+        a[3] + b[3],
+    ]
+}
diff --git a/src/test/codegen/homogeneous-floats.rs b/src/test/codegen/homogeneous-floats.rs
new file mode 100644
index 0000000000000..0b729156d2842
--- /dev/null
+++ b/src/test/codegen/homogeneous-floats.rs
@@ -0,0 +1,32 @@
+//! Check that small (less than 128 bits on x86_64) homogeneous float aggregates are passed
+//! either as an array or by pointer
+
+// compile-flags: -C no-prepopulate-passes -O
+// only-x86_64
+
+#![crate_type = "lib"]
+
+pub struct Foo {
+    bar1: f32,
+    bar2: f32,
+    bar3: f32,
+    bar4: f32,
+}
+
+// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)
+#[no_mangle]
+pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+    todo!()
+}
+
+// CHECK: define void @array_f32x4([4 x float]* {{.*}} sret([4 x float]) {{.*}} %0, [4 x float]* {{.*}} %a, [4 x float]* {{.*}} %b)
+#[no_mangle]
+pub fn array_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    todo!()
+}
+
+// CHECK: define void @array_f32x4_nested(%Foo* {{.*}} sret(%Foo) {{.*}} %0, %Foo* {{.*}} %a, %Foo* {{.*}} %b)
+#[no_mangle]
+pub fn array_f32x4_nested(a: Foo, b: Foo) -> Foo {
+    todo!()
+}
diff --git a/src/test/ui/abi/homogenous-floats.rs b/src/test/ui/abi/homogenous-floats.rs
new file mode 100644
index 0000000000000..cbbcd2a47e82c
--- /dev/null
+++ b/src/test/ui/abi/homogenous-floats.rs
@@ -0,0 +1,46 @@
+// This tests that, no matter the optimization level or the target features enabled, not
+// aggregating homogeneous floats in the ABI is sound and still produces the right answer.
+
+// revisions: opt-0 opt-0-native opt-1 opt-1-native opt-2 opt-2-native opt-3 opt-3-native
+// [opt-0]: compile-flags: -C opt-level=0
+// [opt-1]: compile-flags: -C opt-level=1
+// [opt-2]: compile-flags: -C opt-level=2
+// [opt-3]: compile-flags: -C opt-level=3
+// [opt-0-native]: compile-flags: -C target-cpu=native
+// [opt-1-native]: compile-flags: -C target-cpu=native
+// [opt-2-native]: compile-flags: -C target-cpu=native
+// [opt-3-native]: compile-flags: -C target-cpu=native
+// run-pass
+
+#![feature(core_intrinsics)]
+
+use std::intrinsics::black_box;
+
+pub fn sum_f32(a: f32, b: f32) -> f32 {
+    a + b
+}
+
+pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+    [a[0] + b[0], a[1] + b[1]]
+}
+
+pub fn sum_f32x3(a: [f32; 3], b: [f32; 3]) -> [f32; 3] {
+    [a[0] + b[0], a[1] + b[1], a[2] + b[2]]
+}
+
+pub fn sum_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    [a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]]
+}
+
+fn main() {
+    assert_eq!(1., black_box(sum_f32(black_box(0.), black_box(1.))));
+    assert_eq!([2., 2.], black_box(sum_f32x2(black_box([2., 0.]), black_box([0., 2.]))));
+    assert_eq!(
+        [3., 3., 3.],
+        black_box(sum_f32x3(black_box([1., 2., 3.]), black_box([2., 1., 0.])))
+    );
+    assert_eq!(
+        [4., 4., 4., 4.],
+        black_box(sum_f32x4(black_box([1., 2., 3., 4.]), black_box([3., 2., 1., 0.])))
+    );
+}
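The assembly test above can also be reproduced outside the test suite. The snippet below is illustrative and not part of the patch; compiled on x86_64 with `rustc -O --emit=asm --crate-type=rlib`, it should now produce two bare `addss` instructions, where the old `cast_to(Integer)` path packed the floats into a 64-bit integer first:

```rust
// Illustrative repro, not part of the patch.
// Build on x86_64 with: rustc -O --emit=asm --crate-type=rlib <file>.rs
//
// With this change `[f32; 2]` arrives as `[2 x float]` in XMM registers,
// so each lane is added with `addss` and no integer shuffling is emitted.
#[no_mangle]
pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
    [a[0] + b[0], a[1] + b[1]]
}
```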