diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index d8e56e949dbcb..ea2da43ba1876 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -13,7 +13,8 @@ use rustc_session::{config::OptLevel, DataTypeKind, FieldInfo, SizeKind, Variant
 use rustc_span::symbol::Symbol;
 use rustc_span::{Span, DUMMY_SP};
 use rustc_target::abi::call::{
-    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, Reg, RegKind,
+    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, HomogeneousAggregate, PassMode,
+    Reg, RegKind,
 };
 use rustc_target::abi::*;
 use rustc_target::spec::{abi::Abi as SpecAbi, HasTargetSpec, PanicStrategy, Target};
@@ -3203,10 +3204,27 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
 
             if arg.layout.is_unsized() || size > max_by_val_size {
                 arg.make_indirect();
+            } else if let Ok(HomogeneousAggregate::Homogeneous(Reg {
+                kind: RegKind::Float,
+                ..
+            })) = arg.layout.homogeneous_aggregate(self)
+            {
+                // We don't want to pass float aggregates as aggregates of
+                // integers, because that hurts the generated assembly (#93490).
+                //
+                // As an optimization, pass homogeneous float aggregates larger
+                // than a pointer indirectly.
+                if size > Pointer.size(self) {
+                    arg.make_indirect();
+                }
             } else {
                 // We want to pass small aggregates as immediates, but using
                 // a LLVM aggregate type for this leads to bad optimizations,
                 // so we pick an appropriately sized integer type instead.
+                //
+                // NOTE: This is sub-optimal: for (f32, f32, u32, u32) we could
+                // use ([f32; 2], u64), which is better, but this is the best
+                // we can do right now.
                 arg.cast_to(Reg { kind: RegKind::Integer, size });
             }
         }
@@ -3237,7 +3255,7 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                     arg.make_indirect();
                 }
 
-                _ => {},
+                _ => {}
             }
         };
         fixup(&mut fn_abi.ret);
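For readers who don't work on rustc, the branch added above makes a three-way choice. The sketch below models that choice as a standalone program; `classify`, `POINTER_SIZE`, and the stripped-down `PassMode`/`RegKind` enums are hypothetical stand-ins for the real `rustc_target` types, and the `is_unsized` check is omitted:

```rust
/// Simplified stand-ins for rustc_target's types; names are illustrative.
#[derive(Debug, PartialEq)]
#[allow(dead_code)]
enum RegKind {
    Integer,
    Float,
}

#[derive(Debug, PartialEq)]
enum PassMode {
    /// Left alone, e.g. `[f32; 2]` stays a `[2 x float]` in LLVM IR.
    Direct,
    /// Cast to a register of this kind and byte size.
    Cast(RegKind, u64),
    /// Passed behind a pointer.
    Indirect,
}

const POINTER_SIZE: u64 = 8; // bytes on x86_64

/// Mirrors the new fixup: large values go indirect; homogeneous float
/// aggregates stay direct when they fit in a pointer and go indirect
/// otherwise; everything else is cast to a same-sized integer.
fn classify(size: u64, max_by_val_size: u64, homogeneous_float: bool) -> PassMode {
    if size > max_by_val_size {
        PassMode::Indirect
    } else if homogeneous_float {
        if size > POINTER_SIZE { PassMode::Indirect } else { PassMode::Direct }
    } else {
        PassMode::Cast(RegKind::Integer, size)
    }
}

fn main() {
    // [f32; 2]: 8 bytes of floats, fits in a pointer -> stays direct.
    assert_eq!(classify(8, 16, true), PassMode::Direct);
    // [f32; 4]: 16 bytes of floats, larger than a pointer -> indirect.
    assert_eq!(classify(16, 16, true), PassMode::Indirect);
    // (u32, u32): mixed aggregate -> still cast to an 8-byte integer.
    assert_eq!(classify(8, 16, false), PassMode::Cast(RegKind::Integer, 8));
}
```

The tests below pin down each of these three outcomes against the actual compiler output.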
diff --git a/src/test/assembly/x86-64-homogenous-floats.rs b/src/test/assembly/x86-64-homogenous-floats.rs
new file mode 100644
index 0000000000000..b2ab2aeba2936
--- /dev/null
+++ b/src/test/assembly/x86-64-homogenous-floats.rs
@@ -0,0 +1,45 @@
+// assembly-output: emit-asm
+// needs-llvm-components: x86
+// compile-flags: --target x86_64-unknown-linux-gnu
+// compile-flags: -C llvm-args=--x86-asm-syntax=intel
+// compile-flags: -C opt-level=3
+
+#![crate_type = "rlib"]
+#![no_std]
+
+// CHECK-LABEL: sum_f32:
+// CHECK: addss xmm0, xmm1
+// CHECK-NEXT: ret
+#[no_mangle]
+pub fn sum_f32(a: f32, b: f32) -> f32 {
+    a + b
+}
+
+// CHECK-LABEL: sum_f32x2:
+// CHECK: addss xmm{{[0-9]}}, xmm{{[0-9]}}
+// CHECK-NEXT: addss xmm{{[0-9]}}, xmm{{[0-9]}}
+// CHECK-NEXT: ret
+#[no_mangle]
+pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+    [
+        a[0] + b[0],
+        a[1] + b[1],
+    ]
+}
+
+// CHECK-LABEL: sum_f32x4:
+// CHECK: mov rax, [[PTR_IN:.*]]
+// CHECK-NEXT: movups [[XMMA:xmm[0-9]]], xmmword ptr [rsi]
+// CHECK-NEXT: movups [[XMMB:xmm[0-9]]], xmmword ptr [rdx]
+// CHECK-NEXT: addps [[XMMB]], [[XMMA]]
+// CHECK-NEXT: movups xmmword ptr [[[PTR_IN]]], [[XMMB]]
+// CHECK-NEXT: ret
+#[no_mangle]
+pub fn sum_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    [
+        a[0] + b[0],
+        a[1] + b[1],
+        a[2] + b[2],
+        a[3] + b[3],
+    ]
+}
diff --git a/src/test/codegen/homogeneous-floats.rs b/src/test/codegen/homogeneous-floats.rs
new file mode 100644
index 0000000000000..0b729156d2842
--- /dev/null
+++ b/src/test/codegen/homogeneous-floats.rs
@@ -0,0 +1,32 @@
+//! Check that small (less than 128 bits on x86_64) homogeneous float aggregates are passed
+//! either as an array or by pointer
+
+// compile-flags: -C no-prepopulate-passes -O
+// only-x86_64
+
+#![crate_type = "lib"]
+
+pub struct Foo {
+    bar1: f32,
+    bar2: f32,
+    bar3: f32,
+    bar4: f32,
+}
+
+// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)
+#[no_mangle]
+pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+    todo!()
+}
+
+// CHECK: define void @array_f32x4([4 x float]* {{.*}} sret([4 x float]) {{.*}} %0, [4 x float]* {{.*}} %a, [4 x float]* {{.*}} %b)
+#[no_mangle]
+pub fn array_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    todo!()
+}
+
+// CHECK: define void @array_f32x4_nested(%Foo* {{.*}} sret(%Foo) {{.*}} %0, %Foo* {{.*}} %a, %Foo* {{.*}} %b)
+#[no_mangle]
+pub fn array_f32x4_nested(a: Foo, b: Foo) -> Foo {
+    todo!()
+}
diff --git a/src/test/ui/abi/homogenous-floats.rs b/src/test/ui/abi/homogenous-floats.rs
new file mode 100644
index 0000000000000..cbbcd2a47e82c
--- /dev/null
+++ b/src/test/ui/abi/homogenous-floats.rs
@@ -0,0 +1,46 @@
+// This tests that, no matter the optimization level or the target features enabled, not
+// aggregating homogeneous floats in the ABI is sound and still produces the right answer.
+
+// revisions: opt-0 opt-0-native opt-1 opt-1-native opt-2 opt-2-native opt-3 opt-3-native
+// [opt-0]: compile-flags: -C opt-level=0
+// [opt-1]: compile-flags: -C opt-level=1
+// [opt-2]: compile-flags: -C opt-level=2
+// [opt-3]: compile-flags: -C opt-level=3
+// [opt-0-native]: compile-flags: -C target-cpu=native
+// [opt-1-native]: compile-flags: -C target-cpu=native
+// [opt-2-native]: compile-flags: -C target-cpu=native
+// [opt-3-native]: compile-flags: -C target-cpu=native
+// run-pass
+
+#![feature(core_intrinsics)]
+
+use std::intrinsics::black_box;
+
+pub fn sum_f32(a: f32, b: f32) -> f32 {
+    a + b
+}
+
+pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+    [a[0] + b[0], a[1] + b[1]]
+}
+
+pub fn sum_f32x3(a: [f32; 3], b: [f32; 3]) -> [f32; 3] {
+    [a[0] + b[0], a[1] + b[1], a[2] + b[2]]
+}
+
+pub fn sum_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    [a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]]
+}
+
+fn main() {
+    assert_eq!(1., black_box(sum_f32(black_box(0.), black_box(1.))));
+    assert_eq!([2., 2.], black_box(sum_f32x2(black_box([2., 0.]), black_box([0., 2.]))));
+    assert_eq!(
+        [3., 3., 3.],
+        black_box(sum_f32x3(black_box([1., 2., 3.]), black_box([2., 1., 0.])))
+    );
+    assert_eq!(
+        [4., 4., 4., 4.],
+        black_box(sum_f32x4(black_box([1., 2., 3., 4.]), black_box([3., 2., 1., 0.])))
+    );
+}
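The assembly test above can also be reproduced outside the test suite. The snippet below is illustrative and not part of the patch; compiled on x86_64 with `rustc -O --emit=asm --crate-type=rlib`, it should now produce two bare `addss` instructions, where the old `cast_to(Integer)` path packed the floats into a 64-bit integer first:

```rust
// Illustrative repro, not part of the patch.
// Build on x86_64 with: rustc -O --emit=asm --crate-type=rlib <file>.rs
//
// With this change `[f32; 2]` arrives as `[2 x float]` in XMM registers,
// so each lane is added with `addss` and no integer shuffling is emitted.
#[no_mangle]
pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
    [a[0] + b[0], a[1] + b[1]]
}
```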