rust-lang · alexcrichton · Jun 19, 2017 · Jun 12, 2017
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
@@ -87,10 +87,134 @@ STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
 STATISTIC(StackSlotMerged, "Number of stack slot merged.");
 STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
 
+//===----------------------------------------------------------------------===//
+//                           StackColoring Pass
+//===----------------------------------------------------------------------===//
+//
+// Stack Coloring reduces stack usage by merging stack slots when they
+// can't be used together. For example, consider the following C program:
+//
+//     void bar(char *, int);
+//     void foo(bool var) {
+//         A: {
+//             char z[4096];
+//             bar(z, 0);
+//         }
+//
+//         char *p;
+//         char x[4096];
+//         char y[4096];
+//         if (var) {
+//             p = x;
+//         } else {
+//             bar(y, 1);
+//             p = y + 1024;
+//         }
+//     B:
+//         bar(p, 2);
+//     }
+//
+// Naively-compiled, this program would use 12k of stack space. However, the
+// stack slot corresponding to `z` is always destroyed before either of the
+// stack slots for `x` or `y` are used, and then `x` is only used if `var`
+// is true, while `y` is only used if `var` is false. So in no time are 2
+// of the stack slots used together, and therefore we can merge them,
+// compiling the function using only a single 4k alloca:
+//
+//     void foo(bool var) { // equivalent
+//         char x[4096];
+//         char *p;
+//         bar(x, 0);
+//         if (var) {
+//             p = x;
+//         } else {
+//             bar(x, 1);
+//             p = x + 1024;
+//         }
+//         bar(p, 2);
+//     }
+//
+// This is an important optimization if we want stack space to be under
+// control in large functions, both open-coded ones and ones created by
+// inlining.
 //
 // Implementation Notes:
 // ---------------------
 //
+// An important part of the above reasoning is that `z` can't be accessed
+// while the latter 2 calls to `bar` are running. This is justified because
+// `z`'s lifetime is over after we exit from block `A:`, so any further
+// accesses to it would be UB. The way we represent this information
+// in LLVM is by having frontends delimit blocks with `lifetime.start`
+// and `lifetime.end` intrinsics.
+//
+// The effect of these intrinsics seems to be as follows (maybe I should
+// specify this in the reference?):
+//
+//   L1) at start, each stack-slot is marked as *out-of-scope*, unless no
+//   lifetime intrinsic refers to that stack slot, in which case
+//   it is marked as *in-scope*.
+//   L2) on a `lifetime.start`, a stack slot is marked as *in-scope* and
+//   the stack slot is overwritten with `undef`.
+//   L3) on a `lifetime.end`, a stack slot is marked as *out-of-scope*.
+//   L4) on function exit, all stack slots are marked as *out-of-scope*.
+//   L5) `lifetime.end` is a no-op when called on a slot that is already
+//   *out-of-scope*.
+//   L6) memory accesses to *out-of-scope* stack slots are UB.
+//   L7) when a stack-slot is marked as *out-of-scope*, all pointers to it
+//   are invalidated, unless the slot is "degenerate". This is used to
+//   justify not marking slots as in-use until the pointer to them is
+//   used, but feels a bit hacky in the presence of things like LICM. See
+//   the "Degenerate Slots" section for more details.
+//
+// Now, let's ground stack coloring on these rules. We'll define a slot
+// as *in-use* at a (dynamic) point in execution if it either can be
+// written to at that point, or if it has a live and non-undef content
+// at that point.
+//
+// Obviously, slots that are never *in-use* together can be merged, and
+// in our example `foo`, the slots for `x`, `y` and `z` are never
+// in-use together (of course, sometimes slots that *are* in-use together
+// might still be mergable, but we don't care about that here).
+//
+// In this implementation, we successively merge pairs of slots that are
+// not *in-use* together. We could be smarter - for example, we could merge
+// a single large slot with 2 small slots, or we could construct the
+// interference graph and run a "smart" graph coloring algorithm, but with
+// that aside, how do we find out whether a pair of slots might be *in-use*
+// together?
+//
+// From our rules, we see that *out-of-scope* slots are never *in-use*,
+// and from (L7) we see that "non-degenerate" slots remain non-*in-use*
+// until their address is taken. Therefore, we can approximate slot activity
+// using dataflow.
+//
+// A subtle point: naively, we might try to figure out which pairs of
+// stack-slots interfere by propagating `S in-use` through the CFG for every
+// stack-slot `S`, and having `S` and `T` interfere if there is a CFG point in
+// which they are both *in-use*.
+//
+// That is sound, but overly conservative in some cases: in our (artificial)
+// example `foo`, either `x` or `y` might be in use at the label `B:`, but
+// as `x` is only in use if we came in from the `var` edge and `y` only
+// if we came from the `!var` edge, they still can't be in use together.
+// See PR32488 for an important real-life case.
+//
+// If we wanted to find all points of interference precisely, we could
+// propagate `S in-use` and `S&T in-use` predicates through the CFG. That
+// would be precise, but requires propagating `O(n^2)` dataflow facts.
+//
+// However, we aren't interested in the *set* of points of interference
+// between 2 stack slots, only *whether* there *is* such a point. So we
+// can rely on a little trick: for `S` and `T` to be in-use together,
+// one of them needs to become in-use while the other is in-use (or
+// they might both become in use simultaneously). We can check this
+// by also keeping track of the points at which a stack slot might *start*
+// being in-use.
+//
+// Exact first use:
+// ----------------
+//
 // Consider the following motivating example:
 //
 //     int foo() {
@@ -159,6 +283,9 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
 // lifetime, we can additionally overlap b1 and b5, giving us a 3*1024
 // byte stack (better).
 //
+// Degenerate Slots:
+// -----------------
+//
 // Relying entirely on first-use of stack slots is problematic,
 // however, due to the fact that optimizations can sometimes migrate
 // uses of a variable outside of its lifetime start/end region. Here
@@ -238,10 +365,6 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
 // for "b" then it will appear that 'b' has a degenerate lifetime.
 //
 
-//===----------------------------------------------------------------------===//
-//                           StackColoring Pass
-//===----------------------------------------------------------------------===//
-
 namespace {
 /// StackColoring - A machine pass for merging disjoint stack allocations,
 /// marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
@@ -272,8 +395,11 @@ class StackColoring : public MachineFunctionPass {
   /// Maps basic blocks to a serial number.
   SmallVector<const MachineBasicBlock*, 8> BasicBlockNumbering;
 
-  /// Maps liveness intervals for each slot.
+  /// Maps slots to their use interval. Outside of this interval, slots
+  /// values are either dead or `undef` and they will not be written to.
   SmallVector<std::unique_ptr<LiveInterval>, 16> Intervals;
+  /// Maps slots to the points where they can become in-use.
+  SmallVector<SmallVector<SlotIndex, 4>, 16> LiveStarts;
   /// VNInfo is used for the construction of LiveIntervals.
   VNInfo::Allocator VNInfoAllocator;
   /// SlotIndex analysis object.
@@ -676,15 +802,22 @@ void StackColoring::calculateLocalLiveness()
 
 void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
   SmallVector<SlotIndex, 16> Starts;
-  SmallVector<SlotIndex, 16> Finishes;
+  SmallVector<bool, 16> DefinitelyInUse;
 
   // For each block, find which slots are active within this block
   // and update the live intervals.
   for (const MachineBasicBlock &MBB : *MF) {
     Starts.clear();
     Starts.resize(NumSlots);
-    Finishes.clear();
-    Finishes.resize(NumSlots);
+    DefinitelyInUse.clear();
+    DefinitelyInUse.resize(NumSlots);
+
+    // Start the interval of the slots that we previously found to be 'in-use'.
+    BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
+    for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1;
+         pos = MBBLiveness.LiveIn.find_next(pos)) {
+      Starts[pos] = Indexes->getMBBStartIdx(&MBB);
+    }
 
     // Create the interval for the basic blocks containing lifetime begin/end.
     for (const MachineInstr &MI : MBB) {
@@ -696,68 +829,35 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
       SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
       for (auto Slot : slots) {
         if (IsStart) {
-          if (!Starts[Slot].isValid() || Starts[Slot] > ThisIndex)
+          // If a slot is already definitely in use, we don't have to emit
+          // a new start marker because there is already a pre-existing
+          // one.
+          if (!DefinitelyInUse[Slot]) {
+            LiveStarts[Slot].push_back(ThisIndex);
+            DefinitelyInUse[Slot] = true;
+          }
+          if (!Starts[Slot].isValid())
             Starts[Slot] = ThisIndex;
         } else {
-          if (!Finishes[Slot].isValid() || Finishes[Slot] < ThisIndex)
-            Finishes[Slot] = ThisIndex;
+          if (Starts[Slot].isValid()) {
+            VNInfo *VNI = Intervals[Slot]->getValNumInfo(0);
+            Intervals[Slot]->addSegment(
+                LiveInterval::Segment(Starts[Slot], ThisIndex, VNI));
+            Starts[Slot] = SlotIndex(); // Invalidate the start index
+            DefinitelyInUse[Slot] = false;
+          }
         }
       }
     }
 
-    // Create the interval of the blocks that we previously found to be 'alive'.
-    BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
-    for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1;
-         pos = MBBLiveness.LiveIn.find_next(pos)) {
-      Starts[pos] = Indexes->getMBBStartIdx(&MBB);
-    }
-    for (int pos = MBBLiveness.LiveOut.find_first(); pos != -1;
-         pos = MBBLiveness.LiveOut.find_next(pos)) {
-      Finishes[pos] = Indexes->getMBBEndIdx(&MBB);
-    }
-
+    // Finish up started segments
     for (unsigned i = 0; i < NumSlots; ++i) {
-      //
-      // When LifetimeStartOnFirstUse is turned on, data flow analysis
-      // is forward (from starts to ends), not bidirectional. A
-      // consequence of this is that we can wind up in situations
-      // where Starts[i] is invalid but Finishes[i] is valid and vice
-      // versa. Example:
-      //
-      //     LIFETIME_START x
-      //     if (...) {
-      //       <use of x>
-      //       throw ...;
-      //     }
-      //     LIFETIME_END x
-      //     return 2;
-      //
-      //
-      // Here the slot for "x" will not be live into the block
-      // containing the "return 2" (since lifetimes start with first
-      // use, not at the dominating LIFETIME_START marker).
-      //
-      if (Starts[i].isValid() && !Finishes[i].isValid()) {
-        Finishes[i] = Indexes->getMBBEndIdx(&MBB);
-      }
       if (!Starts[i].isValid())
         continue;
 
-      assert(Starts[i] && Finishes[i] && "Invalid interval");
-      VNInfo *ValNum = Intervals[i]->getValNumInfo(0);
-      SlotIndex S = Starts[i];
-      SlotIndex F = Finishes[i];
-      if (S < F) {
-        // We have a single consecutive region.
-        Intervals[i]->addSegment(LiveInterval::Segment(S, F, ValNum));
-      } else {
-        // We have two non-consecutive regions. This happens when
-        // LIFETIME_START appears after the LIFETIME_END marker.
-        SlotIndex NewStart = Indexes->getMBBStartIdx(&MBB);
-        SlotIndex NewFin = Indexes->getMBBEndIdx(&MBB);
-        Intervals[i]->addSegment(LiveInterval::Segment(NewStart, F, ValNum));
-        Intervals[i]->addSegment(LiveInterval::Segment(S, NewFin, ValNum));
-      }
+      SlotIndex EndIdx = Indexes->getMBBEndIdx(&MBB);
+      VNInfo *VNI = Intervals[i]->getValNumInfo(0);
+      Intervals[i]->addSegment(LiveInterval::Segment(Starts[i], EndIdx, VNI));
     }
   }
 }
@@ -987,6 +1087,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
   BasicBlockNumbering.clear();
   Markers.clear();
   Intervals.clear();
+  LiveStarts.clear();
   VNInfoAllocator.Reset();
 
   unsigned NumSlots = MFI->getObjectIndexEnd();
@@ -998,6 +1099,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
   SmallVector<int, 8> SortedSlots;
   SortedSlots.reserve(NumSlots);
   Intervals.reserve(NumSlots);
+  LiveStarts.resize(NumSlots);
 
   unsigned NumMarkers = collectMarkers(NumSlots);
 
@@ -1069,6 +1171,9 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
     return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS);
   });
 
+  for (auto &s : LiveStarts)
+    std::sort(s.begin(), s.end());
+
   bool Changed = true;
   while (Changed) {
     Changed = false;
@@ -1084,12 +1189,22 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
         int SecondSlot = SortedSlots[J];
         LiveInterval *First = &*Intervals[FirstSlot];
         LiveInterval *Second = &*Intervals[SecondSlot];
+        auto &FirstS = LiveStarts[FirstSlot];
+        auto &SecondS = LiveStarts[SecondSlot];
         assert (!First->empty() && !Second->empty() && "Found an empty range");
 
-        // Merge disjoint slots.
-        if (!First->overlaps(*Second)) {
+        // Merge disjoint slots. This is a little bit tricky - see the
+        // Implementation Notes section for an explanation.
+        if (!First->isLiveAtIndexes(SecondS) &&
+            !Second->isLiveAtIndexes(FirstS)) {
           Changed = true;
           First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));
+
+          int OldSize = FirstS.size();
+          FirstS.append(SecondS.begin(), SecondS.end());
+          auto Mid = FirstS.begin() + OldSize;
+          std::inplace_merge(FirstS.begin(), Mid, FirstS.end());
+
           SlotRemap[SecondSlot] = FirstSlot;
           SortedSlots[J] = -1;
           DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<