From 4948ae5c07a63677d2e9bf882c27079e27a83ca4 Mon Sep 17 00:00:00 2001
From: Nick Fitzgerald <fitzgen@gmail.com>
Date: Thu, 2 Nov 2023 09:14:11 -0700
Subject: [PATCH 1/5] Cranelift: Switch egraph `Cost` to a struct with named
 fields

Mechanical change.
---
 cranelift/codegen/src/egraph/cost.rs | 34 ++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 9 deletions(-)
diff --git a/cranelift/codegen/src/egraph/cost.rs b/cranelift/codegen/src/egraph/cost.rs
index 34ac26e1cd5a..8e0995fd93ee 100644
--- a/cranelift/codegen/src/egraph/cost.rs
+++ b/cranelift/codegen/src/egraph/cost.rs
@@ -31,23 +31,35 @@ use crate::ir::Opcode;
 /// that cannot be computed, or otherwise serve as a sentinel when
 /// performing search for the lowest-cost representation of a value.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub(crate) struct Cost(u32);
+pub(crate) struct Cost {
+    opcode_cost: u32,
+}
+
 impl Cost {
     pub(crate) fn infinity() -> Cost {
         // 2^32 - 1 is, uh, pretty close to infinite... (we use `Cost`
         // only for heuristics and always saturate so this suffices!)
-        Cost(u32::MAX)
+        Cost {
+            opcode_cost: u32::MAX,
+        }
     }
 
     pub(crate) fn zero() -> Cost {
-        Cost(0)
+        Cost { opcode_cost: 0 }
+    }
+
+    pub(crate) fn new(opcode_cost: u32) -> Cost {
+        let cost = Cost { opcode_cost };
+        cost.finite()
     }
 
     /// Clamp this cost at a "finite" value. Can be used in
     /// conjunction with saturating ops to avoid saturating into
     /// `infinity()`.
     fn finite(self) -> Cost {
-        Cost(std::cmp::min(u32::MAX - 1, self.0))
+        Cost {
+            opcode_cost: std::cmp::min(u32::MAX - 1, self.opcode_cost),
+        }
     }
 }
 
@@ -59,8 +71,12 @@ impl std::default::Default for Cost {
 
 impl std::ops::Add<Cost> for Cost {
     type Output = Cost;
+
     fn add(self, other: Cost) -> Cost {
-        Cost(self.0.saturating_add(other.0)).finite()
+        let cost = Cost {
+            opcode_cost: self.opcode_cost.saturating_add(other.opcode_cost),
+        };
+        cost.finite()
     }
 }
 
@@ -70,11 +86,11 @@ impl std::ops::Add<Cost> for Cost {
 pub(crate) fn pure_op_cost(op: Opcode) -> Cost {
     match op {
         // Constants.
-        Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost(1),
+        Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new(1),
 
         // Extends/reduces.
         Opcode::Uextend | Opcode::Sextend | Opcode::Ireduce | Opcode::Iconcat | Opcode::Isplit => {
-            Cost(2)
+            Cost::new(2)
         }
 
         // "Simple" arithmetic.
@@ -86,9 +102,9 @@ pub(crate) fn pure_op_cost(op: Opcode) -> Cost {
         | Opcode::Bnot
         | Opcode::Ishl
         | Opcode::Ushr
-        | Opcode::Sshr => Cost(3),
+        | Opcode::Sshr => Cost::new(3),
 
         // Everything else (pure.)
-        _ => Cost(4),
+        _ => Cost::new(4),
     }
 }

From 73abf7a2ce30ce796ace93cdc2d96a3a15c1aeab Mon Sep 17 00:00:00 2001
From: Nick Fitzgerald <fitzgen@gmail.com>
Date: Thu, 2 Nov 2023 09:41:18 -0700
Subject: [PATCH 2/5] Cranelift: Break op cost ties with expression depth in
 egraphs

This means that, when the opcode cost is the same, we prefer shallow and wide
expressions to narrow and deep. For example, `(a + b) + (c + d)` is preferred to
`((a + b) + c) + d`. This is beneficial because it exposes more
instruction-level parallelism and shortens live ranges.

Co-Authored-By: Trevor Elliott <telliott@fastly.com>
---
 cranelift/codegen/src/egraph/cost.rs      | 66 ++++++++++++++++++++---
 cranelift/codegen/src/egraph/elaborate.rs | 13 ++---
 2 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/cranelift/codegen/src/egraph/cost.rs b/cranelift/codegen/src/egraph/cost.rs
index 8e0995fd93ee..8a59a8dd0456 100644
--- a/cranelift/codegen/src/egraph/cost.rs
+++ b/cranelift/codegen/src/egraph/cost.rs
@@ -30,9 +30,31 @@ use crate::ir::Opcode;
 /// `finite()` method.) An infinite cost is used to represent a value
 /// that cannot be computed, or otherwise serve as a sentinel when
 /// performing search for the lowest-cost representation of a value.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub(crate) struct Cost {
     opcode_cost: u32,
+    depth: u32,
+}
+
+impl Ord for Cost {
+    #[inline]
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        // Break `opcode_cost` ties with `depth`. This means that, when the
+        // opcode cost is the same, we prefer shallow and wide expressions to
+        // narrow and deep. For example, `(a + b) + (c + d)` is preferred to
+        // `((a + b) + c) + d`. This is beneficial because it exposes more
+        // instruction-level parallelism and shortens live ranges.
+        self.opcode_cost
+            .cmp(&other.opcode_cost)
+            .then_with(|| self.depth.cmp(&other.depth))
+    }
+}
+
+impl PartialOrd for Cost {
+    #[inline]
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
 }
 
 impl Cost {
@@ -41,15 +63,22 @@ impl Cost {
         // only for heuristics and always saturate so this suffices!)
         Cost {
             opcode_cost: u32::MAX,
+            depth: u32::MAX,
         }
     }
 
     pub(crate) fn zero() -> Cost {
-        Cost { opcode_cost: 0 }
+        Cost {
+            opcode_cost: 0,
+            depth: 0,
+        }
     }
 
     pub(crate) fn new(opcode_cost: u32) -> Cost {
-        let cost = Cost { opcode_cost };
+        let cost = Cost {
+            opcode_cost,
+            depth: 0,
+        };
         cost.finite()
     }
 
@@ -59,7 +88,28 @@ impl Cost {
     fn finite(self) -> Cost {
         Cost {
             opcode_cost: std::cmp::min(u32::MAX - 1, self.opcode_cost),
+            depth: std::cmp::min(u32::MAX - 1, self.depth),
+        }
+    }
+
+    /// Compute the cost of the operation and its given operands.
+    ///
+    /// Caller is responsible for checking that the opcode came from an instruction
+    /// that satisfies `inst_predicates::is_pure_for_egraph()`.
+    pub(crate) fn of_pure_op(op: Opcode, operand_costs: impl IntoIterator<Item = Self>) -> Self {
+        let mut c: Self = pure_op_cost(op) + operand_costs.into_iter().sum();
+        c.depth = c.depth.saturating_add(1);
+        c
+    }
+}
+
+impl std::iter::Sum<Cost> for Cost {
+    fn sum<I: Iterator<Item = Cost>>(iter: I) -> Self {
+        let mut c = Self::zero();
+        for x in iter {
+            c = c + x;
         }
+        c
     }
 }
 
@@ -75,15 +125,17 @@ impl std::ops::Add<Cost> for Cost {
     fn add(self, other: Cost) -> Cost {
         let cost = Cost {
             opcode_cost: self.opcode_cost.saturating_add(other.opcode_cost),
+            depth: std::cmp::max(self.depth, other.depth),
         };
         cost.finite()
     }
 }
 
-/// Return the cost of a *pure* opcode. Caller is responsible for
-/// checking that the opcode came from an instruction that satisfies
-/// `inst_predicates::is_pure_for_egraph()`.
-pub(crate) fn pure_op_cost(op: Opcode) -> Cost {
+/// Return the cost of a *pure* opcode.
+///
+/// Caller is responsible for checking that the opcode came from an instruction
+/// that satisfies `inst_predicates::is_pure_for_egraph()`.
+fn pure_op_cost(op: Opcode) -> Cost {
     match op {
         // Constants.
         Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new(1),
diff --git a/cranelift/codegen/src/egraph/elaborate.rs b/cranelift/codegen/src/egraph/elaborate.rs
index 18e3629a5f03..5637215ee66d 100644
--- a/cranelift/codegen/src/egraph/elaborate.rs
+++ b/cranelift/codegen/src/egraph/elaborate.rs
@@ -1,7 +1,7 @@
 //! Elaboration phase: lowers EGraph back to sequences of operations
 //! in CFG nodes.
 
-use super::cost::{pure_op_cost, Cost};
+use super::cost::Cost;
 use super::domtree::DomTreeWithChildren;
 use super::Stats;
 use crate::dominator_tree::DominatorTree;
@@ -245,13 +245,10 @@ impl<'a> Elaborator<'a> {
                         // N.B.: at this point we know that the opcode is
                         // pure, so `pure_op_cost`'s precondition is
                         // satisfied.
-                        let cost = self
-                            .func
-                            .dfg
-                            .inst_values(inst)
-                            .fold(pure_op_cost(inst_data.opcode()), |cost, value| {
-                                cost + best[value].0
-                            });
+                        let cost = Cost::of_pure_op(
+                            inst_data.opcode(),
+                            self.func.dfg.inst_values(inst).map(|value| best[value].0),
+                        );
                         best[value] = BestEntry(cost, value);
                     }
                 }

From 86a98641adb7c0674b28f4271595fdb290ae8bce Mon Sep 17 00:00:00 2001
From: Nick Fitzgerald <fitzgen@gmail.com>
Date: Thu, 2 Nov 2023 13:56:39 -0700
Subject: [PATCH 3/5] Cranelift: Bitpack the egraph `Cost` structure

Co-Authored-By: Chris Fallin <chris@cfallin.org>
Co-Authored-By: Trevor Elliott <telliott@fastly.com>
---
 cranelift/codegen/src/egraph/cost.rs | 101 ++++++++++++++++-----------
 1 file changed, 60 insertions(+), 41 deletions(-)

diff --git a/cranelift/codegen/src/egraph/cost.rs b/cranelift/codegen/src/egraph/cost.rs
index 8a59a8dd0456..22d9be814ab9 100644
--- a/cranelift/codegen/src/egraph/cost.rs
+++ b/cranelift/codegen/src/egraph/cost.rs
@@ -30,23 +30,36 @@ use crate::ir::Opcode;
 /// `finite()` method.) An infinite cost is used to represent a value
 /// that cannot be computed, or otherwise serve as a sentinel when
 /// performing search for the lowest-cost representation of a value.
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub(crate) struct Cost {
-    opcode_cost: u32,
-    depth: u32,
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub(crate) struct Cost(u32);
+
+impl core::fmt::Debug for Cost {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        if *self == Cost::infinity() {
+            write!(f, "Cost::Infinite")
+        } else {
+            f.debug_struct("Cost::Finite")
+                .field("op_cost", &self.op_cost())
+                .field("depth", &self.depth())
+                .finish()
+        }
+    }
 }
 
 impl Ord for Cost {
     #[inline]
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        // Break `opcode_cost` ties with `depth`. This means that, when the
-        // opcode cost is the same, we prefer shallow and wide expressions to
-        // narrow and deep. For example, `(a + b) + (c + d)` is preferred to
-        // `((a + b) + c) + d`. This is beneficial because it exposes more
+        // We make sure that the high bits are the op cost and the low bits are
+        // the depth. This means that we can use normal integer comparison to
+        // order by op cost and then depth.
+        //
+        // We want to break op cost ties with depth (rather than the other way
+        // around). When the op cost is the same, we prefer shallow and wide
+        // expressions to narrow and deep expressions and breaking ties with
+        // `depth` gives us that. For example, `(a + b) + (c + d)` is preferred
+        // to `((a + b) + c) + d`. This is beneficial because it exposes more
         // instruction-level parallelism and shortens live ranges.
-        self.opcode_cost
-            .cmp(&other.opcode_cost)
-            .then_with(|| self.depth.cmp(&other.depth))
+        self.0.cmp(&other.0)
     }
 }
 
@@ -58,38 +71,44 @@ impl PartialOrd for Cost {
 }
 
 impl Cost {
+    const DEPTH_BITS: u8 = 8;
+    const DEPTH_MASK: u32 = (1 << Self::DEPTH_BITS) - 1;
+    const OP_COST_MASK: u32 = !Self::DEPTH_MASK;
+    const MAX_OP_COST: u32 = Self::OP_COST_MASK >> Self::DEPTH_BITS;
+
     pub(crate) fn infinity() -> Cost {
         // 2^32 - 1 is, uh, pretty close to infinite... (we use `Cost`
         // only for heuristics and always saturate so this suffices!)
-        Cost {
-            opcode_cost: u32::MAX,
-            depth: u32::MAX,
-        }
+        Cost(u32::MAX)
     }
 
     pub(crate) fn zero() -> Cost {
-        Cost {
-            opcode_cost: 0,
-            depth: 0,
-        }
+        Cost(0)
     }
 
-    pub(crate) fn new(opcode_cost: u32) -> Cost {
-        let cost = Cost {
-            opcode_cost,
-            depth: 0,
-        };
-        cost.finite()
+    fn new(opcode_cost: u32, depth: u8) -> Cost {
+        debug_assert!(
+            opcode_cost <= Self::MAX_OP_COST,
+            "Cost::new: given opcode cost of {opcode_cost} is larger than max of {}",
+            Self::MAX_OP_COST,
+        );
+        Cost((opcode_cost << Self::DEPTH_BITS) | u32::from(depth))
+    }
+
+    fn depth(&self) -> u8 {
+        let depth = self.0 & Self::DEPTH_MASK;
+        u8::try_from(depth).unwrap()
+    }
+
+    fn op_cost(&self) -> u32 {
+        (self.0 & Self::OP_COST_MASK) >> Self::DEPTH_BITS
     }
 
     /// Clamp this cost at a "finite" value. Can be used in
     /// conjunction with saturating ops to avoid saturating into
     /// `infinity()`.
     fn finite(self) -> Cost {
-        Cost {
-            opcode_cost: std::cmp::min(u32::MAX - 1, self.opcode_cost),
-            depth: std::cmp::min(u32::MAX - 1, self.depth),
-        }
+        Cost(std::cmp::min(u32::MAX - 1, self.0))
     }
 
     /// Compute the cost of the operation and its given operands.
@@ -97,9 +116,8 @@ impl Cost {
     /// Caller is responsible for checking that the opcode came from an instruction
     /// that satisfies `inst_predicates::is_pure_for_egraph()`.
     pub(crate) fn of_pure_op(op: Opcode, operand_costs: impl IntoIterator<Item = Self>) -> Self {
-        let mut c: Self = pure_op_cost(op) + operand_costs.into_iter().sum();
-        c.depth = c.depth.saturating_add(1);
-        c
+        let c = pure_op_cost(op) + operand_costs.into_iter().sum();
+        Cost::new(c.op_cost(), c.depth().saturating_add(1)).finite()
     }
 }
 
@@ -123,11 +141,12 @@ impl std::ops::Add<Cost> for Cost {
     type Output = Cost;
 
     fn add(self, other: Cost) -> Cost {
-        let cost = Cost {
-            opcode_cost: self.opcode_cost.saturating_add(other.opcode_cost),
-            depth: std::cmp::max(self.depth, other.depth),
-        };
-        cost.finite()
+        let op_cost = std::cmp::min(
+            self.op_cost().saturating_add(other.op_cost()),
+            Self::MAX_OP_COST,
+        );
+        let depth = std::cmp::max(self.depth(), other.depth());
+        Cost::new(op_cost, depth).finite()
     }
 }
 
@@ -138,11 +157,11 @@ impl std::ops::Add<Cost> for Cost {
 fn pure_op_cost(op: Opcode) -> Cost {
     match op {
         // Constants.
-        Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new(1),
+        Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new(1, 0),
 
         // Extends/reduces.
         Opcode::Uextend | Opcode::Sextend | Opcode::Ireduce | Opcode::Iconcat | Opcode::Isplit => {
-            Cost::new(2)
+            Cost::new(2, 0)
         }
 
         // "Simple" arithmetic.
@@ -154,9 +173,9 @@ fn pure_op_cost(op: Opcode) -> Cost {
         | Opcode::Bnot
         | Opcode::Ishl
         | Opcode::Ushr
-        | Opcode::Sshr => Cost::new(3),
+        | Opcode::Sshr => Cost::new(3, 0),
 
         // Everything else (pure.)
-        _ => Cost::new(4),
+        _ => Cost::new(4, 0),
     }
 }

From e88d7e7045fb92da4ee3f1eebc5756a38a924e79 Mon Sep 17 00:00:00 2001
From: Nick Fitzgerald <fitzgen@gmail.com>
Date: Tue, 7 Nov 2023 13:21:21 -0800
Subject: [PATCH 4/5] Make it so you can't construct `Cost::infinity()` by
 accident

---
 cranelift/codegen/src/egraph/cost.rs | 36 ++++++++++++----------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/cranelift/codegen/src/egraph/cost.rs b/cranelift/codegen/src/egraph/cost.rs
index 22d9be814ab9..fd22b8db3f2b 100644
--- a/cranelift/codegen/src/egraph/cost.rs
+++ b/cranelift/codegen/src/egraph/cost.rs
@@ -74,7 +74,7 @@ impl Cost {
     const DEPTH_BITS: u8 = 8;
     const DEPTH_MASK: u32 = (1 << Self::DEPTH_BITS) - 1;
     const OP_COST_MASK: u32 = !Self::DEPTH_MASK;
-    const MAX_OP_COST: u32 = Self::OP_COST_MASK >> Self::DEPTH_BITS;
+    const MAX_OP_COST: u32 = (Self::OP_COST_MASK >> Self::DEPTH_BITS) - 1;
 
     pub(crate) fn infinity() -> Cost {
         // 2^32 - 1 is, uh, pretty close to infinite... (we use `Cost`
@@ -86,13 +86,14 @@ impl Cost {
         Cost(0)
     }
 
-    fn new(opcode_cost: u32, depth: u8) -> Cost {
-        debug_assert!(
-            opcode_cost <= Self::MAX_OP_COST,
-            "Cost::new: given opcode cost of {opcode_cost} is larger than max of {}",
-            Self::MAX_OP_COST,
-        );
-        Cost((opcode_cost << Self::DEPTH_BITS) | u32::from(depth))
+    /// Construct a new finite cost from the given parts.
+    ///
+    /// The opcode cost is clamped to the maximum value representable.
+    fn new_finite(opcode_cost: u32, depth: u8) -> Cost {
+        let opcode_cost = std::cmp::min(opcode_cost, Self::MAX_OP_COST);
+        let cost = Cost((opcode_cost << Self::DEPTH_BITS) | u32::from(depth));
+        debug_assert_ne!(cost, Cost::infinity());
+        cost
     }
 
     fn depth(&self) -> u8 {
@@ -104,20 +105,13 @@ impl Cost {
         (self.0 & Self::OP_COST_MASK) >> Self::DEPTH_BITS
     }
 
-    /// Clamp this cost at a "finite" value. Can be used in
-    /// conjunction with saturating ops to avoid saturating into
-    /// `infinity()`.
-    fn finite(self) -> Cost {
-        Cost(std::cmp::min(u32::MAX - 1, self.0))
-    }
-
     /// Compute the cost of the operation and its given operands.
     ///
     /// Caller is responsible for checking that the opcode came from an instruction
     /// that satisfies `inst_predicates::is_pure_for_egraph()`.
     pub(crate) fn of_pure_op(op: Opcode, operand_costs: impl IntoIterator<Item = Self>) -> Self {
         let c = pure_op_cost(op) + operand_costs.into_iter().sum();
-        Cost::new(c.op_cost(), c.depth().saturating_add(1)).finite()
+        Cost::new_finite(c.op_cost(), c.depth().saturating_add(1))
     }
 }
 
@@ -146,7 +140,7 @@ impl std::ops::Add<Cost> for Cost {
             Self::MAX_OP_COST,
         );
         let depth = std::cmp::max(self.depth(), other.depth());
-        Cost::new(op_cost, depth).finite()
+        Cost::new_finite(op_cost, depth)
     }
 }
 
@@ -157,11 +151,11 @@ impl std::ops::Add<Cost> for Cost {
 fn pure_op_cost(op: Opcode) -> Cost {
     match op {
         // Constants.
-        Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new(1, 0),
+        Opcode::Iconst | Opcode::F32const | Opcode::F64const => Cost::new_finite(1, 0),
 
         // Extends/reduces.
         Opcode::Uextend | Opcode::Sextend | Opcode::Ireduce | Opcode::Iconcat | Opcode::Isplit => {
-            Cost::new(2, 0)
+            Cost::new_finite(2, 0)
         }
 
         // "Simple" arithmetic.
@@ -173,9 +167,9 @@ fn pure_op_cost(op: Opcode) -> Cost {
         | Opcode::Bnot
         | Opcode::Ishl
         | Opcode::Ushr
-        | Opcode::Sshr => Cost::new(3, 0),
+        | Opcode::Sshr => Cost::new_finite(3, 0),
 
         // Everything else (pure.)
-        _ => Cost::new(4, 0),
+        _ => Cost::new_finite(4, 0),
     }
 }

From bfe8a950de60c70fdc15bc49255458e24c91ad37 Mon Sep 17 00:00:00 2001
From: Nick Fitzgerald <fitzgen@gmail.com>
Date: Tue, 7 Nov 2023 13:25:29 -0800
Subject: [PATCH 5/5] Use fold to code golf

---
 cranelift/codegen/src/egraph/cost.rs | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/cranelift/codegen/src/egraph/cost.rs b/cranelift/codegen/src/egraph/cost.rs
index fd22b8db3f2b..2870b61f515f 100644
--- a/cranelift/codegen/src/egraph/cost.rs
+++ b/cranelift/codegen/src/egraph/cost.rs
@@ -117,11 +117,7 @@ impl Cost {
 
 impl std::iter::Sum<Cost> for Cost {
     fn sum<I: Iterator<Item = Cost>>(iter: I) -> Self {
-        let mut c = Self::zero();
-        for x in iter {
-            c = c + x;
-        }
-        c
+        iter.fold(Self::zero(), |a, b| a + b)
     }
 }