From 5df2af8b0ef33f48b1ee72bcd27bc609b898da52 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson@amd.com>
Date: Sat, 20 Mar 2021 10:29:08 +0900
Subject: [PATCH] [AMDGPU] Merge SIRemoveShortExecBranches into
 SIPreEmitPeephole

SIRemoveShortExecBranches is an optimisation so fits well in the
context of SIPreEmitPeephole.

Test changes relate to early termination from kills which have now
been lowered prior to considering branches for removal.
As these use s_cbranch the execz skips are now retained instead.
Currently either behaviour is valid as kill with EXEC=0 is a nop;
however, if early termination is used differently in future then
the new behaviour is the correct one.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D98917
---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   3 -
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   2 -
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 -
 llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp  |  90 +++++++++-
 .../AMDGPU/SIRemoveShortExecBranches.cpp      | 159 ------------------
 .../GlobalISel/llvm.amdgcn.wqm.demote.ll      |  72 +++++---
 .../AMDGPU/insert-skips-flat-vmem-ds.mir      |   2 +-
 llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir |   2 +-
 .../AMDGPU/insert-skips-ignored-insts.mir     |   2 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll  |  72 +++++---
 ...emove-short-exec-branches-gpr-idx-mode.mir |   2 +-
 ...ort-exec-branches-special-instructions.mir |   2 +-
 .../AMDGPU/skip-branch-taildup-ret.mir        |   2 +-
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll      |  18 +-
 .../transform-block-with-return-to-epilog.ll  |  15 +-
 .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn |   1 -
 16 files changed, 211 insertions(+), 234 deletions(-)
 delete mode 100644 llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index cdd59fe0b847b6..4f9f888506b7ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -204,9 +204,6 @@ extern char &SIWholeQuadModeID;
 void initializeSILowerControlFlowPass(PassRegistry &);
 extern char &SILowerControlFlowID;
 
-void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
-extern char &SIRemoveShortExecBranchesID;
-
 void initializeSIPreEmitPeepholePass(PassRegistry &);
 extern char &SIPreEmitPeepholeID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 9db4e8c8472fdd..2b42f9e1281ebd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -249,7 +249,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
-  initializeSIRemoveShortExecBranchesPass(*PR);
   initializeSIPreEmitPeepholePass(*PR);
   initializeSIInsertSkipsPass(*PR);
   initializeSIMemoryLegalizerPass(*PR);
@@ -1215,7 +1214,6 @@ void GCNPassConfig::addPreEmitPass() {
   if (getOptLevel() > CodeGenOpt::None)
     addPass(&SIInsertHardClausesID);
 
-  addPass(&SIRemoveShortExecBranchesID);
   addPass(&SIInsertSkipsPassID);
   addPass(&SIPreEmitPeepholeID);
   // The hazard recognizer that runs as part of the post-ra scheduler does not
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 7aa256821167e9..03b0c0f45f2dc6 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -137,7 +137,6 @@ add_llvm_target(AMDGPUCodeGen
   SIPreEmitPeephole.cpp
   SIProgramInfo.cpp
   SIRegisterInfo.cpp
-  SIRemoveShortExecBranches.cpp
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 5f10fefa469fcc..93d33fddff52a8 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -21,6 +21,14 @@ using namespace llvm;
 
 #define DEBUG_TYPE "si-pre-emit-peephole"
 
+static unsigned SkipThreshold;
+
+static cl::opt<unsigned, true> SkipThresholdFlag(
+    "amdgpu-skip-threshold", cl::Hidden,
+    cl::desc(
+        "Number of instructions before jumping over divergent control flow"),
+    cl::location(SkipThreshold), cl::init(12));
+
 namespace {
 
 class SIPreEmitPeephole : public MachineFunctionPass {
@@ -30,6 +38,13 @@ class SIPreEmitPeephole : public MachineFunctionPass {
 
   bool optimizeVccBranch(MachineInstr &MI) const;
   bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
+  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
+                            MachineBasicBlock *&TrueMBB,
+                            MachineBasicBlock *&FalseMBB,
+                            SmallVectorImpl<MachineOperand> &Cond);
+  bool mustRetainExeczBranch(const MachineBasicBlock &From,
+                             const MachineBasicBlock &To) const;
+  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
 
 public:
   static char ID;
@@ -258,6 +273,74 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
   return true;
 }
 
+bool SIPreEmitPeephole::getBlockDestinations(
+    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
+    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
+  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
+    return false;
+
+  if (!FalseMBB)
+    FalseMBB = SrcMBB.getNextNode();
+
+  return true;
+}
+
+bool SIPreEmitPeephole::mustRetainExeczBranch(
+    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
+  unsigned NumInstr = 0;
+  const MachineFunction *MF = From.getParent();
+
+  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might never be taken when EXEC = 0.
+      // Hence we should retain cbranch out of the loop lest it become infinite.
+      if (I->isConditionalBranch())
+        return true;
+
+      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
+        return true;
+
+      // These instructions are potentially expensive even if EXEC = 0.
+      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
+          TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
+        return true;
+
+      ++NumInstr;
+      if (NumInstr >= SkipThreshold)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+// Returns true if the skip branch instruction is removed.
+bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
+                                          MachineBasicBlock &SrcMBB) {
+  MachineBasicBlock *TrueMBB = nullptr;
+  MachineBasicBlock *FalseMBB = nullptr;
+  SmallVector<MachineOperand, 1> Cond;
+
+  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
+    return false;
+
+  // Consider only the forward branches.
+  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
+      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
+  MI.eraseFromParent();
+  SrcMBB.removeSuccessor(TrueMBB);
+
+  return true;
+}
+
 bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -265,10 +348,12 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
   MachineBasicBlock *EmptyMBBAtEnd = nullptr;
   bool Changed = false;
 
+  MF.RenumberBlocks();
+
   for (MachineBasicBlock &MBB : MF) {
     MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
     MachineBasicBlock::iterator TermI = MBBE;
-    // Check first terminator for VCC branches to optimize
+    // Check first terminator for branches to optimize
     if (TermI != MBB.end()) {
       MachineInstr &MI = *TermI;
       switch (MI.getOpcode()) {
@@ -276,6 +361,9 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
       case AMDGPU::S_CBRANCH_VCCNZ:
         Changed |= optimizeVccBranch(MI);
         continue;
+      case AMDGPU::S_CBRANCH_EXECZ:
+        Changed |= removeExeczBranch(MI, MBB);
+        continue;
       default:
         break;
       }
diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
deleted file mode 100644
index 104dea8fdff519..00000000000000
--- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-//===-- SIRemoveShortExecBranches.cpp ------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass optmizes the s_cbranch_execz instructions.
-/// The pass removes this skip instruction for short branches,
-/// if there is no unwanted sideeffect in the fallthrough code sequence.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Support/CommandLine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-remove-short-exec-branches"
-
-static unsigned SkipThreshold;
-
-static cl::opt<unsigned, true> SkipThresholdFlag(
-    "amdgpu-skip-threshold", cl::Hidden,
-    cl::desc(
-        "Number of instructions before jumping over divergent control flow"),
-    cl::location(SkipThreshold), cl::init(12));
-
-namespace {
-
-class SIRemoveShortExecBranches : public MachineFunctionPass {
-private:
-  const SIInstrInfo *TII = nullptr;
-  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
-                            MachineBasicBlock *&TrueMBB,
-                            MachineBasicBlock *&FalseMBB,
-                            SmallVectorImpl<MachineOperand> &Cond);
-  bool mustRetainExeczBranch(const MachineBasicBlock &From,
-                             const MachineBasicBlock &To) const;
-  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
-
-public:
-  static char ID;
-
-  SIRemoveShortExecBranches() : MachineFunctionPass(ID) {
-    initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE,
-                "SI remove short exec branches", false, false)
-
-char SIRemoveShortExecBranches::ID = 0;
-
-char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID;
-
-bool SIRemoveShortExecBranches::getBlockDestinations(
-    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
-    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
-  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
-    return false;
-
-  if (!FalseMBB)
-    FalseMBB = SrcMBB.getNextNode();
-
-  return true;
-}
-
-bool SIRemoveShortExecBranches::mustRetainExeczBranch(
-    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
-  unsigned NumInstr = 0;
-  const MachineFunction *MF = From.getParent();
-
-  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
-       MBBI != End && MBBI != ToI; ++MBBI) {
-    const MachineBasicBlock &MBB = *MBBI;
-
-    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
-         I != E; ++I) {
-      // When a uniform loop is inside non-uniform control flow, the branch
-      // leaving the loop might never be taken when EXEC = 0.
-      // Hence we should retain cbranch out of the loop lest it become infinite.
-      if (I->isConditionalBranch())
-        return true;
-
-      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
-        return true;
-
-      if (TII->isKillTerminator(I->getOpcode()))
-        return true;
-
-      // These instructions are potentially expensive even if EXEC = 0.
-      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
-          TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
-        return true;
-
-      ++NumInstr;
-      if (NumInstr >= SkipThreshold)
-        return true;
-    }
-  }
-
-  return false;
-}
-
-// Returns true if the skip branch instruction is removed.
-bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI,
-                                                  MachineBasicBlock &SrcMBB) {
-  MachineBasicBlock *TrueMBB = nullptr;
-  MachineBasicBlock *FalseMBB = nullptr;
-  SmallVector<MachineOperand, 1> Cond;
-
-  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
-    return false;
-
-  // Consider only the forward branches.
-  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
-      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
-    return false;
-
-  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
-  MI.eraseFromParent();
-  SrcMBB.removeSuccessor(TrueMBB);
-
-  return true;
-}
-
-bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  TII = ST.getInstrInfo();
-  MF.RenumberBlocks();
-  bool Changed = false;
-
-  for (MachineBasicBlock &MBB : MF) {
-    MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
-    if (MBBI == MBB.end())
-      continue;
-
-    MachineInstr &MI = *MBBI;
-    switch (MI.getOpcode()) {
-    case AMDGPU::S_CBRANCH_EXECZ:
-      Changed = removeExeczBranch(MI, MBB);
-      break;
-    default:
-      break;
-    }
-  }
-
-  return Changed;
-}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
index f0d76065ddd5a0..1b8689d10a1e2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -166,12 +166,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; SI-NEXT:    s_xor_b64 s[2:3], vcc, -1
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB2_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB2_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_mov_b64 exec, 0
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB2_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -192,12 +193,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX9-NEXT:    s_xor_b64 s[2:3], vcc, -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB2_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_mov_b64 exec, 0
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB2_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -218,12 +220,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX10-32-NEXT:    s_xor_b32 s1, vcc_lo, -1
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
 ; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB2_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB2_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
 ; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -244,12 +247,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], vcc, -1
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB2_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_mov_b64 exec, 0
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB2_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -284,13 +288,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:    s_cbranch_execz BB3_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; SI-NEXT:    s_cbranch_scc0 BB3_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB3_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -312,13 +317,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:    s_cbranch_execz BB3_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB3_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -340,13 +346,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:    s_cbranch_execz BB3_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_wqm_b32 s28, s12
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB3_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
@@ -368,13 +375,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_cbranch_execz BB3_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB3_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
 ; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
@@ -416,13 +424,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:    s_cbranch_execz BB4_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; SI-NEXT:    s_cbranch_scc0 BB4_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB4_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; SI-NEXT:    v_add_f32_e32 v0, v0, v0
 ; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -444,13 +453,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:    s_cbranch_execz BB4_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB4_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -472,13 +482,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:    s_cbranch_execz BB4_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_wqm_b32 s28, s12
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB4_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
@@ -500,13 +511,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_cbranch_execz BB4_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB4_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
 ; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -660,13 +672,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB6_3
 ; SI-NEXT:  ; %bb.1: ; %.demote0
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB6_7
 ; SI-NEXT:  ; %bb.2: ; %.demote0
 ; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
-; SI-NEXT:  ; %bb.3: ; %.continue0
+; SI-NEXT:  BB6_3: ; %.continue0
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[4:5]
@@ -682,12 +695,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; SI-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; SI-NEXT:    s_cbranch_execz BB6_6
 ; SI-NEXT:  ; %bb.4: ; %.demote1
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB6_7
 ; SI-NEXT:  ; %bb.5: ; %.demote1
 ; SI-NEXT:    s_mov_b64 exec, 0
-; SI-NEXT:  ; %bb.6: ; %.continue1
+; SI-NEXT:  BB6_6: ; %.continue1
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
@@ -706,13 +720,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz BB6_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote0
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX9-NEXT:  ; %bb.2: ; %.demote0
 ; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
-; GFX9-NEXT:  ; %bb.3: ; %.continue0
+; GFX9-NEXT:  BB6_3: ; %.continue0
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
@@ -728,12 +743,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB6_6
 ; GFX9-NEXT:  ; %bb.4: ; %.demote1
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX9-NEXT:  ; %bb.5: ; %.demote1
 ; GFX9-NEXT:    s_mov_b64 exec, 0
-; GFX9-NEXT:  ; %bb.6: ; %.continue1
+; GFX9-NEXT:  BB6_6: ; %.continue1
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -752,13 +768,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-32-NEXT:    s_cbranch_execz BB6_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-32-NEXT:    s_wqm_b32 s2, s0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-32-NEXT:  BB6_3: ; %.continue0
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT:    s_mov_b32 s1, s0
 ; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
@@ -772,12 +789,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-32-NEXT:    s_xor_b32 s1, s1, -1
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
 ; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB6_6
 ; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
 ; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
-; GFX10-32-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-32-NEXT:  BB6_6: ; %.continue1
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -796,13 +814,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10-64-NEXT:    s_cbranch_execz BB6_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-64-NEXT:  BB6_3: ; %.continue0
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
@@ -816,12 +835,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB6_6
 ; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
 ; GFX10-64-NEXT:    s_mov_b64 exec, 0
-; GFX10-64-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-64-NEXT:  BB6_6: ; %.continue1
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -883,13 +903,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB7_3
 ; SI-NEXT:  ; %bb.1: ; %.demote0
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB7_9
 ; SI-NEXT:  ; %bb.2: ; %.demote0
 ; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
 ; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
-; SI-NEXT:  ; %bb.3: ; %.continue0.preheader
+; SI-NEXT:  BB7_3: ; %.continue0.preheader
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
@@ -948,13 +969,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz BB7_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote0
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX9-NEXT:  ; %bb.2: ; %.demote0
 ; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
-; GFX9-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX9-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
@@ -1013,13 +1035,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB7_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-32-NEXT:    s_wqm_b32 s3, s0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-32-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX10-32-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX10-32-NEXT:    s_branch BB7_5
@@ -1075,13 +1098,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-64-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX10-64-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-64-NEXT:    s_mov_b64 s[2:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir
index 6ce629a0dc0559..7b37990dfa45e5 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-remove-short-exec-branches -amdgpu-skip-threshold=1 -verify-machineinstrs  %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs  %s -o - | FileCheck %s
 
 ---
 
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir
index 5424ad39b4d9c6..95b53736721977 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-remove-short-exec-branches -amdgpu-skip-threshold=1 -verify-machineinstrs  %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs  %s -o - | FileCheck %s
 # Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0
 
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir
index 928324492d51eb..97c8b50c50cb06 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass si-remove-short-exec-branches -amdgpu-skip-threshold=3 %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=3 %s -o - | FileCheck %s
 
 ---
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index 0b0fb98cacb8eb..9edd1a397b78d3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -167,12 +167,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; SI-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB2_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
 ; SI-NEXT:    s_cbranch_scc0 BB2_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_mov_b64 exec, 0
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB2_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -194,12 +195,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB2_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_mov_b64 exec, 0
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB2_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -221,12 +223,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX10-32-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s0
 ; GFX10-32-NEXT:    s_xor_b32 s0, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB2_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s1, s1, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB2_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
 ; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -248,12 +251,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX10-64-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; GFX10-64-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB2_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_mov_b64 exec, 0
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB2_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -289,13 +293,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:    s_cbranch_execz BB3_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; SI-NEXT:    s_cbranch_scc0 BB3_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB3_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -317,13 +322,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:    s_cbranch_execz BB3_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB3_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -345,13 +351,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:    s_cbranch_execz BB3_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_wqm_b32 s28, s12
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB3_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
@@ -373,13 +380,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_cbranch_execz BB3_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB3_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
 ; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
@@ -421,13 +429,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:    s_cbranch_execz BB4_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; SI-NEXT:    s_cbranch_scc0 BB4_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB4_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; SI-NEXT:    v_add_f32_e32 v0, v0, v0
 ; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -449,13 +458,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:    s_cbranch_execz BB4_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB4_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -477,13 +487,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:    s_cbranch_execz BB4_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_wqm_b32 s28, s12
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB4_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
@@ -505,13 +516,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_cbranch_execz BB4_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB4_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
 ; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -659,13 +671,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; SI-NEXT:    s_cbranch_execz BB6_3
 ; SI-NEXT:  ; %bb.1: ; %.demote0
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB6_7
 ; SI-NEXT:  ; %bb.2: ; %.demote0
 ; SI-NEXT:    s_wqm_b64 s[4:5], s[0:1]
 ; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
-; SI-NEXT:  ; %bb.3: ; %.continue0
+; SI-NEXT:  BB6_3: ; %.continue0
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
@@ -681,12 +694,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB6_6
 ; SI-NEXT:  ; %bb.4: ; %.demote1
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB6_7
 ; SI-NEXT:  ; %bb.5: ; %.demote1
 ; SI-NEXT:    s_mov_b64 exec, 0
-; SI-NEXT:  ; %bb.6: ; %.continue1
+; SI-NEXT:  BB6_6: ; %.continue1
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    v_bfrev_b32_e32 v0, 60
 ; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
@@ -705,13 +719,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz BB6_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote0
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX9-NEXT:  ; %bb.2: ; %.demote0
 ; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
-; GFX9-NEXT:  ; %bb.3: ; %.continue0
+; GFX9-NEXT:  BB6_3: ; %.continue0
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
@@ -727,12 +742,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB6_6
 ; GFX9-NEXT:  ; %bb.4: ; %.demote1
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX9-NEXT:  ; %bb.5: ; %.demote1
 ; GFX9-NEXT:    s_mov_b64 exec, 0
-; GFX9-NEXT:  ; %bb.6: ; %.continue1
+; GFX9-NEXT:  BB6_6: ; %.continue1
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -751,13 +767,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-32-NEXT:    s_cbranch_execz BB6_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-32-NEXT:    s_wqm_b32 s2, s0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-32-NEXT:  BB6_3: ; %.continue0
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT:    s_mov_b32 s1, s0
 ; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
@@ -771,12 +788,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-32-NEXT:    s_or_b32 s1, s1, vcc_lo
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
 ; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB6_6
 ; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
 ; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
-; GFX10-32-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-32-NEXT:  BB6_6: ; %.continue1
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -795,13 +813,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10-64-NEXT:    s_cbranch_execz BB6_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-64-NEXT:  BB6_3: ; %.continue0
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
@@ -815,12 +834,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-64-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB6_6
 ; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
 ; GFX10-64-NEXT:    s_mov_b64 exec, 0
-; GFX10-64-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-64-NEXT:  BB6_6: ; %.continue1
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -875,13 +895,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB7_3
 ; SI-NEXT:  ; %bb.1: ; %.demote0
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB7_9
 ; SI-NEXT:  ; %bb.2: ; %.demote0
 ; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
-; SI-NEXT:  ; %bb.3: ; %.continue0.preheader
+; SI-NEXT:  BB7_3: ; %.continue0.preheader
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    s_branch BB7_5
@@ -940,13 +961,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB7_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote0
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX9-NEXT:  ; %bb.2: ; %.demote0
 ; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
-; GFX9-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX9-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_branch BB7_5
@@ -1005,13 +1027,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB7_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-32-NEXT:    s_wqm_b32 s3, s0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-32-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX10-32-NEXT:    s_mov_b32 s2, 0
 ; GFX10-32-NEXT:    s_branch BB7_5
@@ -1067,13 +1090,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-64-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX10-64-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX10-64-NEXT:    s_branch BB7_5
diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir
index 0f0d210799a92e..3dddb0fef23033 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir
+++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=10 -verify-machineinstrs  %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs  %s -o - | FileCheck %s
 # Make sure mandatory skips are not removed around mode defs.
 # FIXME: -amdgpu-skip-threshold seems to be backwards.
 
diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
index ee72fa99a12975..58b1ab9ace01a5 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
+++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=10 -verify-machineinstrs  %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs  %s -o - | FileCheck %s
 # Make sure mandatory skips are not removed around mode defs.
 # FIXME: -amdgpu-skip-threshold seems to be backwards.
 
diff --git a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
index 5979720d0cc7a9..4c53c51d1ce4e1 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
+++ b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=1000000 -o -  %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=1000000 -o -  %s | FileCheck %s
 
 ---
 name: skip_branch_taildup_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index f535e28c671850..690fe5a7e68329 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1002,13 +1002,14 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ; SI-NEXT:    v_cmp_nle_f32_e32 vcc, 0, v1
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB13_3
 ; SI-NEXT:  ; %bb.1: ; %bb3
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
 ; SI-NEXT:    s_cbranch_scc0 BB13_6
 ; SI-NEXT:  ; %bb.2: ; %bb3
 ; SI-NEXT:    s_andn2_b64 exec, exec, vcc
-; SI-NEXT:  ; %bb.3: ; %bb4
+; SI-NEXT:  BB13_3: ; %bb4
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_mov_b32 s1, s0
 ; SI-NEXT:    s_mov_b32 s2, s0
@@ -1043,13 +1044,14 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ; GFX10-WAVE64-NEXT:    s_mov_b32 s0, 0
 ; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX10-WAVE64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-WAVE64-NEXT:    s_cbranch_execz BB13_3
 ; GFX10-WAVE64-NEXT:  ; %bb.1: ; %bb3
 ; GFX10-WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
 ; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB13_6
 ; GFX10-WAVE64-NEXT:  ; %bb.2: ; %bb3
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
-; GFX10-WAVE64-NEXT:  ; %bb.3: ; %bb4
+; GFX10-WAVE64-NEXT:  BB13_3: ; %bb4
 ; GFX10-WAVE64-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX10-WAVE64-NEXT:    s_mov_b32 s1, s0
 ; GFX10-WAVE64-NEXT:    s_mov_b32 s2, s0
@@ -1082,13 +1084,14 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ; GFX10-WAVE32-NEXT:    s_mov_b32 s0, 0
 ; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; GFX10-WAVE32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-WAVE32-NEXT:    s_cbranch_execz BB13_3
 ; GFX10-WAVE32-NEXT:  ; %bb.1: ; %bb3
 ; GFX10-WAVE32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
 ; GFX10-WAVE32-NEXT:    s_andn2_b32 s1, s1, vcc_lo
 ; GFX10-WAVE32-NEXT:    s_cbranch_scc0 BB13_6
 ; GFX10-WAVE32-NEXT:  ; %bb.2: ; %bb3
 ; GFX10-WAVE32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
-; GFX10-WAVE32-NEXT:  ; %bb.3: ; %bb4
+; GFX10-WAVE32-NEXT:  BB13_3: ; %bb4
 ; GFX10-WAVE32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX10-WAVE32-NEXT:    s_mov_b32 s1, s0
 ; GFX10-WAVE32-NEXT:    s_mov_b32 s2, s0
@@ -1154,12 +1157,13 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; SI-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v1
 ; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; SI-NEXT:    s_cbranch_execz BB14_3
 ; SI-NEXT:  ; %bb.1: ; %kill
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB14_6
 ; SI-NEXT:  ; %bb.2: ; %kill
 ; SI-NEXT:    s_mov_b64 exec, 0
-; SI-NEXT:  ; %bb.3: ; %Flow
+; SI-NEXT:  BB14_3: ; %Flow
 ; SI-NEXT:    s_or_saveexec_b64 s[0:1], s[2:3]
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_xor_b64 exec, exec, s[0:1]
@@ -1190,12 +1194,13 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; GFX10-WAVE64-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v1
 ; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10-WAVE64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10-WAVE64-NEXT:    s_cbranch_execz BB14_3
 ; GFX10-WAVE64-NEXT:  ; %bb.1: ; %kill
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB14_6
 ; GFX10-WAVE64-NEXT:  ; %bb.2: ; %kill
 ; GFX10-WAVE64-NEXT:    s_mov_b64 exec, 0
-; GFX10-WAVE64-NEXT:  ; %bb.3: ; %Flow
+; GFX10-WAVE64-NEXT:  BB14_3: ; %Flow
 ; GFX10-WAVE64-NEXT:    s_or_saveexec_b64 s[0:1], s[2:3]
 ; GFX10-WAVE64-NEXT:    ; implicit-def: $vgpr2
 ; GFX10-WAVE64-NEXT:    s_xor_b64 exec, exec, s[0:1]
@@ -1226,12 +1231,13 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; GFX10-WAVE32-NEXT:    v_cmp_ge_f32_e32 vcc_lo, 0, v1
 ; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-WAVE32-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-WAVE32-NEXT:    s_cbranch_execz BB14_3
 ; GFX10-WAVE32-NEXT:  ; %bb.1: ; %kill
 ; GFX10-WAVE32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-WAVE32-NEXT:    s_cbranch_scc0 BB14_6
 ; GFX10-WAVE32-NEXT:  ; %bb.2: ; %kill
 ; GFX10-WAVE32-NEXT:    s_mov_b32 exec_lo, 0
-; GFX10-WAVE32-NEXT:  ; %bb.3: ; %Flow
+; GFX10-WAVE32-NEXT:  BB14_3: ; %Flow
 ; GFX10-WAVE32-NEXT:    s_or_saveexec_b32 s0, s1
 ; GFX10-WAVE32-NEXT:    ; implicit-def: $vgpr2
 ; GFX10-WAVE32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index 9f42347f5ec856..e5a019e5d04afa 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -108,25 +108,26 @@ define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(floa
   ; GCN:   liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
   ; GCN:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GCN: bb.4.Flow1:
-  ; GCN:   successors: %bb.5(0x40000000)
+  ; GCN:   successors: %bb.5(0x40000000), %bb.7(0x40000000)
   ; GCN:   liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
   ; GCN:   renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GCN:   $exec = S_XOR_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc
+  ; GCN:   S_CBRANCH_EXECZ %bb.7, implicit $exec
   ; GCN: bb.5.kill0:
-  ; GCN:   successors: %bb.8(0x40000000), %bb.7(0x40000000)
+  ; GCN:   successors: %bb.6(0x40000000), %bb.8(0x40000000)
   ; GCN:   liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
   ; GCN:   dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc
-  ; GCN:   S_CBRANCH_SCC0 %bb.7, implicit $scc
-  ; GCN: bb.8.kill0:
-  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   S_CBRANCH_SCC0 %bb.8, implicit $scc
+  ; GCN: bb.6.kill0:
+  ; GCN:   successors: %bb.7(0x80000000)
   ; GCN:   liveins: $sgpr2_sgpr3, $scc
   ; GCN:   $exec = S_MOV_B64 0
-  ; GCN: bb.6.end:
+  ; GCN: bb.7.end:
   ; GCN:   successors: %bb.9(0x80000000)
   ; GCN:   liveins: $sgpr2_sgpr3
   ; GCN:   $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
   ; GCN:   S_BRANCH %bb.9
-  ; GCN: bb.7:
+  ; GCN: bb.8:
   ; GCN:   $exec = S_MOV_B64 0
   ; GCN:   EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
   ; GCN:   S_ENDPGM 0
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index 6b50aa58bd6bdf..3693e706bec4dd 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -222,7 +222,6 @@ static_library("LLVMAMDGPUCodeGen") {
     "SIPreEmitPeephole.cpp",
     "SIProgramInfo.cpp",
     "SIRegisterInfo.cpp",
-    "SIRemoveShortExecBranches.cpp",
     "SIShrinkInstructions.cpp",
     "SIWholeQuadMode.cpp",
   ]